diff --git a/linux-tkg-patches/5.15/0008-5.15-bcachefs.patch b/linux-tkg-patches/5.15/0008-5.15-bcachefs.patch index 68cfa62..d722aad 100644 --- a/linux-tkg-patches/5.15/0008-5.15-bcachefs.patch +++ b/linux-tkg-patches/5.15/0008-5.15-bcachefs.patch @@ -1,5 +1,355 @@ +From 6032ed7e926efcff82d52458c7fd7a42c255cea1 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Sun, 6 Nov 2022 10:52:14 +0100 +Subject: [PATCH] bcachefs-5.15 + +Signed-off-by: Peter Jung +--- + block/bio.c | 34 +- + block/blk-core.c | 13 +- + drivers/block/loop.c | 1 - + drivers/md/bcache/Kconfig | 10 +- + drivers/md/bcache/Makefile | 4 +- + drivers/md/bcache/bcache.h | 2 +- + drivers/md/bcache/super.c | 1 - + drivers/md/bcache/util.h | 3 +- + fs/Kconfig | 1 + + fs/Makefile | 1 + + fs/bcachefs/Kconfig | 52 + + fs/bcachefs/Makefile | 65 + + fs/bcachefs/acl.c | 406 ++ + fs/bcachefs/acl.h | 58 + + fs/bcachefs/alloc_background.c | 1343 +++++++ + fs/bcachefs/alloc_background.h | 139 + + fs/bcachefs/alloc_foreground.c | 1263 ++++++ + fs/bcachefs/alloc_foreground.h | 173 + + fs/bcachefs/alloc_types.h | 87 + + fs/bcachefs/bcachefs.h | 974 +++++ + fs/bcachefs/bcachefs_format.h | 1986 ++++++++++ + fs/bcachefs/bcachefs_ioctl.h | 367 ++ + fs/bcachefs/bkey.c | 1172 ++++++ + fs/bcachefs/bkey.h | 566 +++ + fs/bcachefs/bkey_buf.h | 60 + + fs/bcachefs/bkey_methods.c | 463 +++ + fs/bcachefs/bkey_methods.h | 105 + + fs/bcachefs/bkey_sort.c | 198 + + fs/bcachefs/bkey_sort.h | 44 + + fs/bcachefs/bset.c | 1598 ++++++++ + fs/bcachefs/bset.h | 615 +++ + fs/bcachefs/btree_cache.c | 1160 ++++++ + fs/bcachefs/btree_cache.h | 107 + + fs/bcachefs/btree_gc.c | 2102 ++++++++++ + fs/bcachefs/btree_gc.h | 105 + + fs/bcachefs/btree_io.c | 2111 ++++++++++ + fs/bcachefs/btree_io.h | 222 ++ + fs/bcachefs/btree_iter.c | 3329 ++++++++++++++++ + fs/bcachefs/btree_iter.h | 406 ++ + fs/bcachefs/btree_key_cache.c | 743 ++++ + fs/bcachefs/btree_key_cache.h | 45 + + fs/bcachefs/btree_locking.h | 259 ++ + fs/bcachefs/btree_types.h | 713 ++++ + 
fs/bcachefs/btree_update.h | 141 + + fs/bcachefs/btree_update_interior.c | 2238 +++++++++++ + fs/bcachefs/btree_update_interior.h | 321 ++ + fs/bcachefs/btree_update_leaf.c | 1756 +++++++++ + fs/bcachefs/buckets.c | 2122 ++++++++++ + fs/bcachefs/buckets.h | 298 ++ + fs/bcachefs/buckets_types.h | 104 + + fs/bcachefs/buckets_waiting_for_journal.c | 167 + + fs/bcachefs/buckets_waiting_for_journal.h | 15 + + .../buckets_waiting_for_journal_types.h | 23 + + fs/bcachefs/chardev.c | 761 ++++ + fs/bcachefs/chardev.h | 31 + + fs/bcachefs/checksum.c | 665 ++++ + fs/bcachefs/checksum.h | 204 + + fs/bcachefs/clock.c | 191 + + fs/bcachefs/clock.h | 38 + + fs/bcachefs/clock_types.h | 37 + + fs/bcachefs/compress.c | 641 +++ + fs/bcachefs/compress.h | 18 + + fs/bcachefs/darray.h | 76 + + fs/bcachefs/debug.c | 628 +++ + fs/bcachefs/debug.h | 30 + + fs/bcachefs/dirent.c | 545 +++ + fs/bcachefs/dirent.h | 67 + + fs/bcachefs/disk_groups.c | 506 +++ + fs/bcachefs/disk_groups.h | 90 + + fs/bcachefs/ec.c | 1682 ++++++++ + fs/bcachefs/ec.h | 228 ++ + fs/bcachefs/ec_types.h | 46 + + fs/bcachefs/errcode.h | 12 + + fs/bcachefs/error.c | 185 + + fs/bcachefs/error.h | 238 ++ + fs/bcachefs/extent_update.c | 178 + + fs/bcachefs/extent_update.h | 12 + + fs/bcachefs/extents.c | 1259 ++++++ + fs/bcachefs/extents.h | 688 ++++ + fs/bcachefs/extents_types.h | 40 + + fs/bcachefs/eytzinger.h | 281 ++ + fs/bcachefs/fifo.h | 127 + + fs/bcachefs/fs-common.c | 494 +++ + fs/bcachefs/fs-common.h | 43 + + fs/bcachefs/fs-io.c | 3495 +++++++++++++++++ + fs/bcachefs/fs-io.h | 57 + + fs/bcachefs/fs-ioctl.c | 523 +++ + fs/bcachefs/fs-ioctl.h | 81 + + fs/bcachefs/fs.c | 1940 +++++++++ + fs/bcachefs/fs.h | 208 + + fs/bcachefs/fsck.c | 2356 +++++++++++ + fs/bcachefs/fsck.h | 8 + + fs/bcachefs/inode.c | 720 ++++ + fs/bcachefs/inode.h | 204 + + fs/bcachefs/io.c | 2416 ++++++++++++ + fs/bcachefs/io.h | 189 + + fs/bcachefs/io_types.h | 161 + + fs/bcachefs/journal.c | 1410 +++++++ + fs/bcachefs/journal.h | 522 +++ + 
fs/bcachefs/journal_io.c | 1700 ++++++++ + fs/bcachefs/journal_io.h | 60 + + fs/bcachefs/journal_reclaim.c | 847 ++++ + fs/bcachefs/journal_reclaim.h | 86 + + fs/bcachefs/journal_sb.c | 222 ++ + fs/bcachefs/journal_sb.h | 24 + + fs/bcachefs/journal_seq_blacklist.c | 322 ++ + fs/bcachefs/journal_seq_blacklist.h | 22 + + fs/bcachefs/journal_types.h | 340 ++ + fs/bcachefs/keylist.c | 67 + + fs/bcachefs/keylist.h | 76 + + fs/bcachefs/keylist_types.h | 16 + + fs/bcachefs/lru.c | 203 + + fs/bcachefs/lru.h | 17 + + fs/bcachefs/migrate.c | 196 + + fs/bcachefs/migrate.h | 7 + + fs/bcachefs/move.c | 1130 ++++++ + fs/bcachefs/move.h | 73 + + fs/bcachefs/move_types.h | 19 + + fs/bcachefs/movinggc.c | 424 ++ + fs/bcachefs/movinggc.h | 9 + + fs/bcachefs/opts.c | 560 +++ + fs/bcachefs/opts.h | 517 +++ + fs/bcachefs/quota.c | 852 ++++ + fs/bcachefs/quota.h | 71 + + fs/bcachefs/quota_types.h | 43 + + fs/bcachefs/rebalance.c | 349 ++ + fs/bcachefs/rebalance.h | 28 + + fs/bcachefs/rebalance_types.h | 26 + + fs/bcachefs/recovery.c | 1472 +++++++ + fs/bcachefs/recovery.h | 66 + + fs/bcachefs/reflink.c | 404 ++ + fs/bcachefs/reflink.h | 73 + + fs/bcachefs/replicas.c | 1073 +++++ + fs/bcachefs/replicas.h | 106 + + fs/bcachefs/replicas_types.h | 10 + + fs/bcachefs/siphash.c | 173 + + fs/bcachefs/siphash.h | 87 + + fs/bcachefs/str_hash.h | 351 ++ + fs/bcachefs/subvolume.c | 1075 +++++ + fs/bcachefs/subvolume.h | 124 + + fs/bcachefs/subvolume_types.h | 9 + + fs/bcachefs/super-io.c | 1601 ++++++++ + fs/bcachefs/super-io.h | 126 + + fs/bcachefs/super.c | 1966 ++++++++++ + fs/bcachefs/super.h | 264 ++ + fs/bcachefs/super_types.h | 51 + + fs/bcachefs/sysfs.c | 889 +++++ + fs/bcachefs/sysfs.h | 44 + + fs/bcachefs/tests.c | 947 +++++ + fs/bcachefs/tests.h | 15 + + fs/bcachefs/trace.c | 12 + + fs/bcachefs/util.c | 984 +++++ + fs/bcachefs/util.h | 877 +++++ + fs/bcachefs/varint.c | 120 + + fs/bcachefs/varint.h | 11 + + fs/bcachefs/vstructs.h | 63 + + fs/bcachefs/xattr.c | 629 +++ + 
fs/bcachefs/xattr.h | 50 + + fs/dcache.c | 10 +- + fs/inode.c | 218 +- + include/linux/bio.h | 7 +- + include/linux/blkdev.h | 1 + + .../md/bcache => include/linux}/closure.h | 39 +- + include/linux/compiler_attributes.h | 5 + + include/linux/dcache.h | 1 + + include/linux/exportfs.h | 6 + + include/linux/fs.h | 9 +- + include/linux/generic-radix-tree.h | 6 + + include/linux/list_bl.h | 22 + + include/linux/lockdep.h | 4 + + include/linux/sched.h | 1 + + include/linux/six.h | 203 + + include/linux/vmalloc.h | 1 + + include/trace/events/bcachefs.h | 1034 +++++ + init/init_task.c | 1 + + kernel/Kconfig.locks | 3 + + kernel/locking/Makefile | 1 + + kernel/locking/lockdep.c | 20 + + kernel/locking/six.c | 759 ++++ + kernel/module.c | 4 +- + lib/Kconfig | 3 + + lib/Kconfig.debug | 9 + + lib/Makefile | 2 + + {drivers/md/bcache => lib}/closure.c | 35 +- + lib/generic-radix-tree.c | 17 +- + mm/filemap.c | 1 + + mm/nommu.c | 18 + + mm/vmalloc.c | 21 + + 188 files changed, 78910 insertions(+), 151 deletions(-) + create mode 100644 fs/bcachefs/Kconfig + create mode 100644 fs/bcachefs/Makefile + create mode 100644 fs/bcachefs/acl.c + create mode 100644 fs/bcachefs/acl.h + create mode 100644 fs/bcachefs/alloc_background.c + create mode 100644 fs/bcachefs/alloc_background.h + create mode 100644 fs/bcachefs/alloc_foreground.c + create mode 100644 fs/bcachefs/alloc_foreground.h + create mode 100644 fs/bcachefs/alloc_types.h + create mode 100644 fs/bcachefs/bcachefs.h + create mode 100644 fs/bcachefs/bcachefs_format.h + create mode 100644 fs/bcachefs/bcachefs_ioctl.h + create mode 100644 fs/bcachefs/bkey.c + create mode 100644 fs/bcachefs/bkey.h + create mode 100644 fs/bcachefs/bkey_buf.h + create mode 100644 fs/bcachefs/bkey_methods.c + create mode 100644 fs/bcachefs/bkey_methods.h + create mode 100644 fs/bcachefs/bkey_sort.c + create mode 100644 fs/bcachefs/bkey_sort.h + create mode 100644 fs/bcachefs/bset.c + create mode 100644 fs/bcachefs/bset.h + create mode 100644 
fs/bcachefs/btree_cache.c + create mode 100644 fs/bcachefs/btree_cache.h + create mode 100644 fs/bcachefs/btree_gc.c + create mode 100644 fs/bcachefs/btree_gc.h + create mode 100644 fs/bcachefs/btree_io.c + create mode 100644 fs/bcachefs/btree_io.h + create mode 100644 fs/bcachefs/btree_iter.c + create mode 100644 fs/bcachefs/btree_iter.h + create mode 100644 fs/bcachefs/btree_key_cache.c + create mode 100644 fs/bcachefs/btree_key_cache.h + create mode 100644 fs/bcachefs/btree_locking.h + create mode 100644 fs/bcachefs/btree_types.h + create mode 100644 fs/bcachefs/btree_update.h + create mode 100644 fs/bcachefs/btree_update_interior.c + create mode 100644 fs/bcachefs/btree_update_interior.h + create mode 100644 fs/bcachefs/btree_update_leaf.c + create mode 100644 fs/bcachefs/buckets.c + create mode 100644 fs/bcachefs/buckets.h + create mode 100644 fs/bcachefs/buckets_types.h + create mode 100644 fs/bcachefs/buckets_waiting_for_journal.c + create mode 100644 fs/bcachefs/buckets_waiting_for_journal.h + create mode 100644 fs/bcachefs/buckets_waiting_for_journal_types.h + create mode 100644 fs/bcachefs/chardev.c + create mode 100644 fs/bcachefs/chardev.h + create mode 100644 fs/bcachefs/checksum.c + create mode 100644 fs/bcachefs/checksum.h + create mode 100644 fs/bcachefs/clock.c + create mode 100644 fs/bcachefs/clock.h + create mode 100644 fs/bcachefs/clock_types.h + create mode 100644 fs/bcachefs/compress.c + create mode 100644 fs/bcachefs/compress.h + create mode 100644 fs/bcachefs/darray.h + create mode 100644 fs/bcachefs/debug.c + create mode 100644 fs/bcachefs/debug.h + create mode 100644 fs/bcachefs/dirent.c + create mode 100644 fs/bcachefs/dirent.h + create mode 100644 fs/bcachefs/disk_groups.c + create mode 100644 fs/bcachefs/disk_groups.h + create mode 100644 fs/bcachefs/ec.c + create mode 100644 fs/bcachefs/ec.h + create mode 100644 fs/bcachefs/ec_types.h + create mode 100644 fs/bcachefs/errcode.h + create mode 100644 fs/bcachefs/error.c + create mode 
100644 fs/bcachefs/error.h + create mode 100644 fs/bcachefs/extent_update.c + create mode 100644 fs/bcachefs/extent_update.h + create mode 100644 fs/bcachefs/extents.c + create mode 100644 fs/bcachefs/extents.h + create mode 100644 fs/bcachefs/extents_types.h + create mode 100644 fs/bcachefs/eytzinger.h + create mode 100644 fs/bcachefs/fifo.h + create mode 100644 fs/bcachefs/fs-common.c + create mode 100644 fs/bcachefs/fs-common.h + create mode 100644 fs/bcachefs/fs-io.c + create mode 100644 fs/bcachefs/fs-io.h + create mode 100644 fs/bcachefs/fs-ioctl.c + create mode 100644 fs/bcachefs/fs-ioctl.h + create mode 100644 fs/bcachefs/fs.c + create mode 100644 fs/bcachefs/fs.h + create mode 100644 fs/bcachefs/fsck.c + create mode 100644 fs/bcachefs/fsck.h + create mode 100644 fs/bcachefs/inode.c + create mode 100644 fs/bcachefs/inode.h + create mode 100644 fs/bcachefs/io.c + create mode 100644 fs/bcachefs/io.h + create mode 100644 fs/bcachefs/io_types.h + create mode 100644 fs/bcachefs/journal.c + create mode 100644 fs/bcachefs/journal.h + create mode 100644 fs/bcachefs/journal_io.c + create mode 100644 fs/bcachefs/journal_io.h + create mode 100644 fs/bcachefs/journal_reclaim.c + create mode 100644 fs/bcachefs/journal_reclaim.h + create mode 100644 fs/bcachefs/journal_sb.c + create mode 100644 fs/bcachefs/journal_sb.h + create mode 100644 fs/bcachefs/journal_seq_blacklist.c + create mode 100644 fs/bcachefs/journal_seq_blacklist.h + create mode 100644 fs/bcachefs/journal_types.h + create mode 100644 fs/bcachefs/keylist.c + create mode 100644 fs/bcachefs/keylist.h + create mode 100644 fs/bcachefs/keylist_types.h + create mode 100644 fs/bcachefs/lru.c + create mode 100644 fs/bcachefs/lru.h + create mode 100644 fs/bcachefs/migrate.c + create mode 100644 fs/bcachefs/migrate.h + create mode 100644 fs/bcachefs/move.c + create mode 100644 fs/bcachefs/move.h + create mode 100644 fs/bcachefs/move_types.h + create mode 100644 fs/bcachefs/movinggc.c + create mode 100644 
fs/bcachefs/movinggc.h + create mode 100644 fs/bcachefs/opts.c + create mode 100644 fs/bcachefs/opts.h + create mode 100644 fs/bcachefs/quota.c + create mode 100644 fs/bcachefs/quota.h + create mode 100644 fs/bcachefs/quota_types.h + create mode 100644 fs/bcachefs/rebalance.c + create mode 100644 fs/bcachefs/rebalance.h + create mode 100644 fs/bcachefs/rebalance_types.h + create mode 100644 fs/bcachefs/recovery.c + create mode 100644 fs/bcachefs/recovery.h + create mode 100644 fs/bcachefs/reflink.c + create mode 100644 fs/bcachefs/reflink.h + create mode 100644 fs/bcachefs/replicas.c + create mode 100644 fs/bcachefs/replicas.h + create mode 100644 fs/bcachefs/replicas_types.h + create mode 100644 fs/bcachefs/siphash.c + create mode 100644 fs/bcachefs/siphash.h + create mode 100644 fs/bcachefs/str_hash.h + create mode 100644 fs/bcachefs/subvolume.c + create mode 100644 fs/bcachefs/subvolume.h + create mode 100644 fs/bcachefs/subvolume_types.h + create mode 100644 fs/bcachefs/super-io.c + create mode 100644 fs/bcachefs/super-io.h + create mode 100644 fs/bcachefs/super.c + create mode 100644 fs/bcachefs/super.h + create mode 100644 fs/bcachefs/super_types.h + create mode 100644 fs/bcachefs/sysfs.c + create mode 100644 fs/bcachefs/sysfs.h + create mode 100644 fs/bcachefs/tests.c + create mode 100644 fs/bcachefs/tests.h + create mode 100644 fs/bcachefs/trace.c + create mode 100644 fs/bcachefs/util.c + create mode 100644 fs/bcachefs/util.h + create mode 100644 fs/bcachefs/varint.c + create mode 100644 fs/bcachefs/varint.h + create mode 100644 fs/bcachefs/vstructs.h + create mode 100644 fs/bcachefs/xattr.c + create mode 100644 fs/bcachefs/xattr.h + rename {drivers/md/bcache => include/linux}/closure.h (94%) + create mode 100644 include/linux/six.h + create mode 100644 include/trace/events/bcachefs.h + create mode 100644 kernel/locking/six.c + rename {drivers/md/bcache => lib}/closure.c (88%) + diff --git a/block/bio.c b/block/bio.c -index a6fb6a0b4295..3c9cc0000168 100644 
+index ba9120d4fe49..ba076c1547ff 100644 --- a/block/bio.c +++ b/block/bio.c @@ -526,15 +526,15 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs) @@ -21,7 +371,7 @@ index a6fb6a0b4295..3c9cc0000168 100644 /** * bio_truncate - truncate the bio to small size of @new_size -@@ -1284,17 +1284,28 @@ EXPORT_SYMBOL(bio_advance); +@@ -1265,17 +1265,27 @@ EXPORT_SYMBOL(bio_advance); void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, struct bio *src, struct bvec_iter *src_iter) { @@ -46,19 +396,18 @@ index a6fb6a0b4295..3c9cc0000168 100644 + memcpy(dst_p + dst_bv.bv_offset, + src_p + src_bv.bv_offset, + bytes); -+ + +- memcpy(dst_buf, src_buf, bytes); + kunmap_atomic(dst_p); + kunmap_atomic(src_p); - -- memcpy(dst_buf, src_buf, bytes); - + - kunmap_local(dst_buf); - kunmap_local(src_buf); + flush_dcache_page(dst_bv.bv_page); - + bio_advance_iter_single(src, src_iter, bytes); bio_advance_iter_single(dst, dst_iter, bytes); -@@ -1366,6 +1378,7 @@ void bio_set_pages_dirty(struct bio *bio) +@@ -1349,6 +1359,7 @@ void bio_set_pages_dirty(struct bio *bio) set_page_dirty_lock(bvec->bv_page); } } @@ -66,7 +415,7 @@ index a6fb6a0b4295..3c9cc0000168 100644 /* * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. 
-@@ -1425,6 +1438,7 @@ void bio_check_pages_dirty(struct bio *bio) +@@ -1408,6 +1419,7 @@ void bio_check_pages_dirty(struct bio *bio) spin_unlock_irqrestore(&bio_dirty_lock, flags); schedule_work(&bio_dirty_work); } @@ -75,10 +424,10 @@ index a6fb6a0b4295..3c9cc0000168 100644 static inline bool bio_remaining_done(struct bio *bio) { diff --git a/block/blk-core.c b/block/blk-core.c -index 4d8f5fe91588..96cf713f03a7 100644 +index 13e1fca1e923..99576a4c0bb1 100644 --- a/block/blk-core.c +++ b/block/blk-core.c -@@ -214,18 +214,23 @@ int blk_status_to_errno(blk_status_t status) +@@ -215,18 +215,23 @@ int blk_status_to_errno(blk_status_t status) } EXPORT_SYMBOL_GPL(blk_status_to_errno); @@ -107,10 +456,10 @@ index 4d8f5fe91588..96cf713f03a7 100644 blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)), req->cmd_flags & ~REQ_OP_MASK, diff --git a/drivers/block/loop.c b/drivers/block/loop.c -index 7bf4686af774..80511131b884 100644 +index 79e485949b60..f5036056a430 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c -@@ -1412,7 +1412,6 @@ static int __loop_clr_fd(struct loop_device *lo, bool release) +@@ -1410,7 +1410,6 @@ static int __loop_clr_fd(struct loop_device *lo, bool release) partscan = lo->lo_flags & LO_FLAGS_PARTSCAN && bdev; lo_number = lo->lo_number; @@ -180,10 +529,10 @@ index 5fc989a6d452..e5e147d0e49a 100644 struct bucket { atomic_t pin; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index f2874c77ff79..7217e05107bf 100644 +index af4fa8071cbc..46cae9a7f7fb 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c -@@ -2910,7 +2910,6 @@ static int __init bcache_init(void) +@@ -2911,7 +2911,6 @@ static int __init bcache_init(void) goto err; bch_debug_init(); @@ -296,10 +645,10 @@ index 000000000000..27742ce276cd + Include some unit and performance tests for the core btree code diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile new file mode 100644 -index 000000000000..71cda24e6d08 +index 000000000000..7ddae26116a0 
--- /dev/null +++ b/fs/bcachefs/Makefile -@@ -0,0 +1,62 @@ +@@ -0,0 +1,65 @@ + +obj-$(CONFIG_BCACHEFS_FS) += bcachefs.o + @@ -318,6 +667,7 @@ index 000000000000..71cda24e6d08 + btree_update_interior.o \ + btree_update_leaf.o \ + buckets.o \ ++ buckets_waiting_for_journal.o \ + chardev.o \ + checksum.o \ + clock.o \ @@ -339,8 +689,10 @@ index 000000000000..71cda24e6d08 + journal.o \ + journal_io.o \ + journal_reclaim.o \ ++ journal_sb.o \ + journal_seq_blacklist.o \ + keylist.o \ ++ lru.o \ + migrate.o \ + move.o \ + movinggc.o \ @@ -840,10 +1192,10 @@ index 000000000000..2d76a4897ba8 +#endif /* _BCACHEFS_ACL_H */ diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c new file mode 100644 -index 000000000000..b2735c8591d6 +index 000000000000..e8a34eccac25 --- /dev/null +++ b/fs/bcachefs/alloc_background.c -@@ -0,0 +1,1312 @@ +@@ -0,0 +1,1343 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "alloc_background.h" @@ -855,10 +1207,12 @@ index 000000000000..b2735c8591d6 +#include "btree_update_interior.h" +#include "btree_gc.h" +#include "buckets.h" ++#include "buckets_waiting_for_journal.h" +#include "clock.h" +#include "debug.h" +#include "ec.h" +#include "error.h" ++#include "lru.h" +#include "recovery.h" +#include "varint.h" + @@ -871,12 +1225,7 @@ index 000000000000..b2735c8591d6 +#include +#include + -+const char * const bch2_allocator_states[] = { -+#define x(n) #n, -+ ALLOC_THREAD_STATES() -+#undef x -+ NULL -+}; ++/* Persistent alloc info: */ + +static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { +#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8, @@ -884,7 +1233,28 @@ index 000000000000..b2735c8591d6 +#undef x +}; + -+/* Persistent alloc info: */ ++const char * const bch2_bucket_states[] = { ++ "free", ++ "need gc gens", ++ "need discard", ++ "cached", ++ "dirty", ++ NULL ++}; ++ ++struct bkey_alloc_unpacked { ++ u64 journal_seq; ++ u64 bucket; ++ u8 dev; ++ u8 gen; ++ u8 oldest_gen; ++ u8 data_type; ++ 
bool need_discard:1; ++ bool need_inc_gen:1; ++#define x(_name, _bits) u##_bits _name; ++ BCH_ALLOC_FIELDS_V2() ++#undef x ++}; + +static inline u64 alloc_field_v1_get(const struct bch_alloc *a, + const void **p, unsigned field) @@ -1006,6 +1376,8 @@ index 000000000000..b2735c8591d6 + out->gen = a.v->gen; + out->oldest_gen = a.v->oldest_gen; + out->data_type = a.v->data_type; ++ out->need_discard = BCH_ALLOC_V3_NEED_DISCARD(a.v); ++ out->need_inc_gen = BCH_ALLOC_V3_NEED_INC_GEN(a.v); + out->journal_seq = le64_to_cpu(a.v->journal_seq); + +#define x(_name, _bits) \ @@ -1027,47 +1399,7 @@ index 000000000000..b2735c8591d6 + return 0; +} + -+static void bch2_alloc_pack_v3(struct bkey_alloc_buf *dst, -+ const struct bkey_alloc_unpacked src) -+{ -+ struct bkey_i_alloc_v3 *a = bkey_alloc_v3_init(&dst->k); -+ unsigned nr_fields = 0, last_nonzero_fieldnr = 0; -+ u8 *out = a->v.data; -+ u8 *end = (void *) &dst[1]; -+ u8 *last_nonzero_field = out; -+ unsigned bytes; -+ -+ a->k.p = POS(src.dev, src.bucket); -+ a->v.gen = src.gen; -+ a->v.oldest_gen = src.oldest_gen; -+ a->v.data_type = src.data_type; -+ a->v.journal_seq = cpu_to_le64(src.journal_seq); -+ -+#define x(_name, _bits) \ -+ nr_fields++; \ -+ \ -+ if (src._name) { \ -+ out += bch2_varint_encode_fast(out, src._name); \ -+ \ -+ last_nonzero_field = out; \ -+ last_nonzero_fieldnr = nr_fields; \ -+ } else { \ -+ *out++ = 0; \ -+ } -+ -+ BCH_ALLOC_FIELDS_V2() -+#undef x -+ BUG_ON(out > end); -+ -+ out = last_nonzero_field; -+ a->v.nr_fields = last_nonzero_fieldnr; -+ -+ bytes = (u8 *) out - (u8 *) &a->v; -+ set_bkey_val_bytes(&a->k, bytes); -+ memset_u64s_tail(&a->v, 0, bytes); -+} -+ -+struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) ++static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) +{ + struct bkey_alloc_unpacked ret = { + .dev = k.k->p.inode, @@ -1090,11 +1422,71 @@ index 000000000000..b2735c8591d6 + return ret; +} + -+void bch2_alloc_pack(struct bch_fs *c, -+ struct bkey_alloc_buf 
*dst, -+ const struct bkey_alloc_unpacked src) ++void bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out) +{ -+ bch2_alloc_pack_v3(dst, src); ++ if (k.k->type == KEY_TYPE_alloc_v4) { ++ *out = *bkey_s_c_to_alloc_v4(k).v; ++ } else { ++ struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); ++ ++ *out = (struct bch_alloc_v4) { ++ .journal_seq = u.journal_seq, ++ .flags = u.need_discard, ++ .gen = u.gen, ++ .oldest_gen = u.oldest_gen, ++ .data_type = u.data_type, ++ .stripe_redundancy = u.stripe_redundancy, ++ .dirty_sectors = u.dirty_sectors, ++ .cached_sectors = u.cached_sectors, ++ .io_time[READ] = u.read_time, ++ .io_time[WRITE] = u.write_time, ++ .stripe = u.stripe, ++ }; ++ } ++} ++ ++struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) ++{ ++ struct bkey_i_alloc_v4 *ret; ++ ++ if (k.k->type == KEY_TYPE_alloc_v4) { ++ ret = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ if (!IS_ERR(ret)) ++ bkey_reassemble(&ret->k_i, k); ++ } else { ++ ret = bch2_trans_kmalloc(trans, sizeof(*ret)); ++ if (!IS_ERR(ret)) { ++ bkey_alloc_v4_init(&ret->k_i); ++ ret->k.p = k.k->p; ++ bch2_alloc_to_v4(k, &ret->v); ++ } ++ } ++ return ret; ++} ++ ++struct bkey_i_alloc_v4 * ++bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter, ++ struct bpos pos) ++{ ++ struct bkey_s_c k; ++ struct bkey_i_alloc_v4 *a; ++ int ret; ++ ++ bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos, ++ BTREE_ITER_WITH_UPDATES| ++ BTREE_ITER_CACHED| ++ BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) { ++ bch2_trans_iter_exit(trans, iter); ++ return ERR_PTR(ret); ++ } ++ ++ a = bch2_alloc_to_v4_mut(trans, k); ++ if (IS_ERR(a)) ++ bch2_trans_iter_exit(trans, iter); ++ return a; +} + +static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) @@ -1140,150 +1532,805 @@ index 000000000000..b2735c8591d6 +const char *bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + struct 
bkey_alloc_unpacked u; ++ struct bch_dev *ca; + + if (k.k->p.inode >= c->sb.nr_devices || + !c->devs[k.k->p.inode]) + return "invalid device"; + ++ ca = bch_dev_bkey_exists(c, k.k->p.inode); ++ ++ if (k.k->p.offset < ca->mi.first_bucket || ++ k.k->p.offset >= ca->mi.nbuckets) ++ return "invalid bucket"; ++ + if (bch2_alloc_unpack_v3(&u, k)) + return "unpack error"; + + return NULL; +} + -+void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) ++const char *bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ -+ struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); -+ -+ pr_buf(out, "gen %u oldest_gen %u data_type %s journal_seq %llu", -+ u.gen, u.oldest_gen, bch2_data_types[u.data_type], -+ u.journal_seq); -+#define x(_name, ...) pr_buf(out, " " #_name " %llu", (u64) u._name); -+ BCH_ALLOC_FIELDS_V2() -+#undef x -+} -+ -+static int bch2_alloc_read_fn(struct btree_trans *trans, struct bkey_s_c k) -+{ -+ struct bch_fs *c = trans->c; + struct bch_dev *ca; -+ struct bucket *g; -+ struct bkey_alloc_unpacked u; + -+ if (!bkey_is_alloc(k.k)) -+ return 0; ++ if (k.k->p.inode >= c->sb.nr_devices || ++ !c->devs[k.k->p.inode]) ++ return "invalid device"; + + ca = bch_dev_bkey_exists(c, k.k->p.inode); -+ g = bucket(ca, k.k->p.offset); -+ u = bch2_alloc_unpack(k); + -+ g->_mark.gen = u.gen; -+ g->_mark.data_type = u.data_type; -+ g->_mark.dirty_sectors = u.dirty_sectors; -+ g->_mark.cached_sectors = u.cached_sectors; -+ g->io_time[READ] = u.read_time; -+ g->io_time[WRITE] = u.write_time; -+ g->oldest_gen = u.oldest_gen; -+ g->gen_valid = 1; ++ if (k.k->p.offset < ca->mi.first_bucket || ++ k.k->p.offset >= ca->mi.nbuckets) ++ return "invalid bucket"; + -+ return 0; ++ return NULL; ++} ++ ++void bch2_alloc_v4_swab(struct bkey_s k) ++{ ++ struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v; ++ ++ a->journal_seq = swab64(a->journal_seq); ++ a->flags = swab32(a->flags); ++ a->dirty_sectors = swab32(a->dirty_sectors); ++ a->cached_sectors = 
swab32(a->cached_sectors); ++ a->io_time[0] = swab64(a->io_time[0]); ++ a->io_time[1] = swab64(a->io_time[1]); ++ a->stripe = swab32(a->stripe); ++ a->nr_external_backpointers = swab32(a->nr_external_backpointers); ++} ++ ++void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bch_alloc_v4 a; ++ ++ bch2_alloc_to_v4(k, &a); ++ ++ pr_buf(out, "gen %u oldest_gen %u data_type %s journal_seq %llu need_discard %llu", ++ a.gen, a.oldest_gen, bch2_data_types[a.data_type], ++ a.journal_seq, BCH_ALLOC_V4_NEED_DISCARD(&a)); ++ pr_buf(out, " dirty_sectors %u", a.dirty_sectors); ++ pr_buf(out, " cached_sectors %u", a.cached_sectors); ++ pr_buf(out, " stripe %u", a.stripe); ++ pr_buf(out, " stripe_redundancy %u", a.stripe_redundancy); ++ pr_buf(out, " read_time %llu", a.io_time[READ]); ++ pr_buf(out, " write_time %llu", a.io_time[WRITE]); +} + +int bch2_alloc_read(struct bch_fs *c) +{ + struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bch_alloc_v4 a; ++ struct bch_dev *ca; + int ret; + + bch2_trans_init(&trans, c, 0, 0); -+ down_read(&c->gc_lock); -+ ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_alloc, bch2_alloc_read_fn); -+ up_read(&c->gc_lock); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ ca = bch_dev_bkey_exists(c, k.k->p.inode); ++ bch2_alloc_to_v4(k, &a); ++ ++ *bucket_gen(ca, k.k->p.offset) = a.gen; ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ + bch2_trans_exit(&trans); -+ if (ret) { ++ ++ if (ret) + bch_err(c, "error reading alloc info: %i", ret); -+ return ret; ++ ++ return ret; ++} ++ ++/* Free space/discard btree: */ ++ ++static int bch2_bucket_do_index(struct btree_trans *trans, ++ struct bkey_s_c alloc_k, ++ struct bch_alloc_v4 a, ++ bool set) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode); ++ struct btree_iter iter; ++ struct bkey_s_c old; ++ struct bkey_i *k; ++ enum 
bucket_state state = bucket_state(a); ++ enum btree_id btree; ++ enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted; ++ enum bch_bkey_type new_type = set ? KEY_TYPE_set : KEY_TYPE_deleted; ++ struct printbuf buf = PRINTBUF; ++ int ret; ++ ++ if (state != BUCKET_free && ++ state != BUCKET_need_discard) ++ return 0; ++ ++ k = bch2_trans_kmalloc(trans, sizeof(*k)); ++ if (IS_ERR(k)) ++ return PTR_ERR(k); ++ ++ bkey_init(&k->k); ++ k->k.type = new_type; ++ ++ switch (state) { ++ case BUCKET_free: ++ btree = BTREE_ID_freespace; ++ k->k.p = alloc_freespace_pos(alloc_k.k->p, a); ++ bch2_key_resize(&k->k, 1); ++ break; ++ case BUCKET_need_discard: ++ btree = BTREE_ID_need_discard; ++ k->k.p = alloc_k.k->p; ++ break; ++ default: ++ return 0; ++ } ++ ++ bch2_trans_iter_init(trans, &iter, btree, ++ bkey_start_pos(&k->k), ++ BTREE_ITER_INTENT); ++ old = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(old); ++ if (ret) ++ goto err; ++ ++ if (ca->mi.freespace_initialized && ++ bch2_fs_inconsistent_on(old.k->type != old_type, c, ++ "incorrect key when %s %s btree (got %s should be %s)\n" ++ " for %s", ++ set ? 
"setting" : "clearing", ++ bch2_btree_ids[btree], ++ bch2_bkey_types[old.k->type], ++ bch2_bkey_types[old_type], ++ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { ++ ret = -EIO; ++ goto err; ++ } ++ ++ ret = bch2_trans_update(trans, &iter, k, 0); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++int bch2_trans_mark_alloc(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_i *new, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_alloc_v4 old_a, *new_a; ++ u64 old_lru, new_lru; ++ int ret = 0; ++ ++ /* ++ * Deletion only happens in the device removal path, with ++ * BTREE_TRIGGER_NORUN: ++ */ ++ BUG_ON(new->k.type != KEY_TYPE_alloc_v4); ++ ++ bch2_alloc_to_v4(old, &old_a); ++ new_a = &bkey_i_to_alloc_v4(new)->v; ++ ++ if (new_a->dirty_sectors > old_a.dirty_sectors || ++ new_a->cached_sectors > old_a.cached_sectors) { ++ new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); ++ new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now)); ++ SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true); ++ SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true); ++ } ++ ++ if (old_a.data_type && !new_a->data_type && ++ old_a.gen == new_a->gen && ++ !bch2_bucket_is_open_safe(c, new->k.p.inode, new->k.p.offset)) { ++ new_a->gen++; ++ SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false); ++ } ++ ++ if (bucket_state(old_a) != bucket_state(*new_a) || ++ (bucket_state(*new_a) == BUCKET_free && ++ alloc_freespace_genbits(old_a) != alloc_freespace_genbits(*new_a))) { ++ ret = bch2_bucket_do_index(trans, old, old_a, false) ?: ++ bch2_bucket_do_index(trans, bkey_i_to_s_c(new), *new_a, true); ++ if (ret) ++ return ret; ++ } ++ ++ old_lru = alloc_lru_idx(old_a); ++ new_lru = alloc_lru_idx(*new_a); ++ ++ if (old_lru != new_lru) { ++ ret = bch2_lru_change(trans, new->k.p.inode, new->k.p.offset, ++ old_lru, &new_lru); ++ if (ret) ++ return ret; ++ ++ if (new_lru && new_a->io_time[READ] != new_lru) ++ 
new_a->io_time[READ] = new_lru; + } + + return 0; +} + -+static int bch2_alloc_write_key(struct btree_trans *trans, -+ struct btree_iter *iter, -+ unsigned flags) ++static int bch2_check_alloc_key(struct btree_trans *trans, ++ struct btree_iter *alloc_iter) +{ + struct bch_fs *c = trans->c; -+ struct bkey_s_c k; -+ struct bch_dev *ca; -+ struct bucket *g; -+ struct bucket_mark m; -+ struct bkey_alloc_unpacked old_u, new_u; -+ struct bkey_alloc_buf a; ++ struct btree_iter discard_iter, freespace_iter, lru_iter; ++ struct bch_alloc_v4 a; ++ unsigned discard_key_type, freespace_key_type; ++ struct bkey_s_c alloc_k, k; ++ struct printbuf buf = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; + int ret; -+retry: -+ bch2_trans_begin(trans); + -+ ret = bch2_btree_key_cache_flush(trans, -+ BTREE_ID_alloc, iter->pos); ++ alloc_k = bch2_btree_iter_peek(alloc_iter); ++ if (!alloc_k.k) ++ return 0; ++ ++ ret = bkey_err(alloc_k); + if (ret) -+ goto err; ++ return ret; + -+ k = bch2_btree_iter_peek_slot(iter); ++ bch2_alloc_to_v4(alloc_k, &a); ++ discard_key_type = bucket_state(a) == BUCKET_need_discard ++ ? KEY_TYPE_set : 0; ++ freespace_key_type = bucket_state(a) == BUCKET_free ++ ? 
KEY_TYPE_set : 0; ++ ++ bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, ++ alloc_k.k->p, 0); ++ bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, ++ alloc_freespace_pos(alloc_k.k->p, a), 0); ++ bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru, ++ POS(alloc_k.k->p.inode, a.io_time[READ]), 0); ++ ++ k = bch2_btree_iter_peek_slot(&discard_iter); + ret = bkey_err(k); + if (ret) + goto err; + -+ old_u = bch2_alloc_unpack(k); ++ if (fsck_err_on(k.k->type != discard_key_type, c, ++ "incorrect key in need_discard btree (got %s should be %s)\n" ++ " %s", ++ bch2_bkey_types[k.k->type], ++ bch2_bkey_types[discard_key_type], ++ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { ++ struct bkey_i *update = ++ bch2_trans_kmalloc(trans, sizeof(*update)); + -+ percpu_down_read(&c->mark_lock); -+ ca = bch_dev_bkey_exists(c, iter->pos.inode); -+ g = bucket(ca, iter->pos.offset); -+ m = READ_ONCE(g->mark); -+ new_u = alloc_mem_to_key(iter, g, m); -+ percpu_up_read(&c->mark_lock); ++ ret = PTR_ERR_OR_ZERO(update); ++ if (ret) ++ goto err; + -+ if (!bkey_alloc_unpacked_cmp(old_u, new_u)) -+ return 0; ++ bkey_init(&update->k); ++ update->k.type = discard_key_type; ++ update->k.p = discard_iter.pos; + -+ bch2_alloc_pack(c, &a, new_u); -+ ret = bch2_trans_update(trans, iter, &a.k, -+ BTREE_TRIGGER_NORUN) ?: -+ bch2_trans_commit(trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL|flags); -+err: -+ if (ret == -EINTR) -+ goto retry; -+ return ret; -+} ++ ret = bch2_trans_update(trans, &discard_iter, update, 0) ?: ++ bch2_trans_commit(trans, NULL, NULL, 0); ++ if (ret) ++ goto err; ++ } + -+int bch2_alloc_write(struct bch_fs *c, unsigned flags) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bch_dev *ca; -+ unsigned i; -+ int ret = 0; ++ k = bch2_btree_iter_peek_slot(&freespace_iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; + -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, 
POS_MIN, -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ if (fsck_err_on(k.k->type != freespace_key_type, c, ++ "incorrect key in freespace btree (got %s should be %s)\n" ++ " %s", ++ bch2_bkey_types[k.k->type], ++ bch2_bkey_types[freespace_key_type], ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { ++ struct bkey_i *update = ++ bch2_trans_kmalloc(trans, sizeof(*update)); + -+ for_each_member_device(ca, c, i) { -+ bch2_btree_iter_set_pos(&iter, -+ POS(ca->dev_idx, ca->mi.first_bucket)); ++ ret = PTR_ERR_OR_ZERO(update); ++ if (ret) ++ goto err; + -+ while (iter.pos.offset < ca->mi.nbuckets) { -+ ret = bch2_alloc_write_key(&trans, &iter, flags); -+ if (ret) { -+ percpu_ref_put(&ca->ref); ++ bkey_init(&update->k); ++ update->k.type = freespace_key_type; ++ update->k.p = freespace_iter.pos; ++ bch2_key_resize(&update->k, 1); ++ ++ ret = bch2_trans_update(trans, &freespace_iter, update, 0) ?: ++ bch2_trans_commit(trans, NULL, NULL, 0); ++ if (ret) ++ goto err; ++ } ++ ++ if (bucket_state(a) == BUCKET_cached) { ++ k = bch2_btree_iter_peek_slot(&lru_iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (fsck_err_on(!a.io_time[READ], c, ++ "cached bucket with read_time 0\n" ++ " %s", ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)) || ++ fsck_err_on(k.k->type != KEY_TYPE_lru || ++ le64_to_cpu(bkey_s_c_to_lru(k).v->idx) != alloc_k.k->p.offset, c, ++ "incorrect/missing lru entry\n" ++ " %s\n" ++ " %s", ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), ++ (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) { ++ u64 read_time = a.io_time[READ]; ++ ++ if (!a.io_time[READ]) ++ a.io_time[READ] = atomic64_read(&c->io_clock[READ].now); ++ ++ ret = bch2_lru_change(trans, ++ alloc_k.k->p.inode, ++ alloc_k.k->p.offset, ++ 0, &a.io_time[READ]); ++ if (ret) + goto err; ++ ++ if (a.io_time[READ] != read_time) { ++ struct bkey_i_alloc_v4 *a_mut = ++ bch2_alloc_to_v4_mut(trans, alloc_k); ++ ret 
= PTR_ERR_OR_ZERO(a_mut); ++ if (ret) ++ goto err; ++ ++ a_mut->v.io_time[READ] = a.io_time[READ]; ++ ret = bch2_trans_update(trans, alloc_iter, ++ &a_mut->k_i, BTREE_TRIGGER_NORUN); ++ if (ret) ++ goto err; + } -+ bch2_btree_iter_advance(&iter); ++ ++ ret = bch2_trans_commit(trans, NULL, NULL, 0); ++ if (ret) ++ goto err; + } + } +err: ++fsck_err: ++ bch2_trans_iter_exit(trans, &lru_iter); ++ bch2_trans_iter_exit(trans, &freespace_iter); ++ bch2_trans_iter_exit(trans, &discard_iter); ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos) ++{ ++ struct bch_dev *ca; ++ ++ if (pos.inode >= c->sb.nr_devices || !c->devs[pos.inode]) ++ return false; ++ ++ ca = bch_dev_bkey_exists(c, pos.inode); ++ return pos.offset >= ca->mi.first_bucket && ++ pos.offset < ca->mi.nbuckets; ++} ++ ++static int bch2_check_freespace_key(struct btree_trans *trans, ++ struct btree_iter *freespace_iter, ++ bool initial) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter alloc_iter; ++ struct bkey_s_c k, freespace_k; ++ struct bch_alloc_v4 a; ++ u64 genbits; ++ struct bpos pos; ++ struct bkey_i *update; ++ struct printbuf buf = PRINTBUF; ++ int ret; ++ ++ freespace_k = bch2_btree_iter_peek(freespace_iter); ++ if (!freespace_k.k) ++ return 1; ++ ++ ret = bkey_err(freespace_k); ++ if (ret) ++ return ret; ++ ++ pos = freespace_iter->pos; ++ pos.offset &= ~(~0ULL << 56); ++ genbits = freespace_iter->pos.offset & (~0ULL << 56); ++ ++ bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, pos, 0); ++ ++ if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c, ++ "%llu:%llu set in freespace btree but device or bucket does not exist", ++ pos.inode, pos.offset)) ++ goto delete; ++ ++ k = bch2_btree_iter_peek_slot(&alloc_iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ bch2_alloc_to_v4(k, &a); ++ ++ if (fsck_err_on(bucket_state(a) != BUCKET_free || ++ genbits != alloc_freespace_genbits(a), c, ++ 
"%s\n incorrectly set in freespace index (free %u, genbits %llu should be %llu)", ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf), ++ bucket_state(a) == BUCKET_free, ++ genbits >> 56, alloc_freespace_genbits(a) >> 56)) ++ goto delete; ++out: ++err: ++fsck_err: ++ bch2_trans_iter_exit(trans, &alloc_iter); ++ printbuf_exit(&buf); ++ return ret; ++delete: ++ update = bch2_trans_kmalloc(trans, sizeof(*update)); ++ ret = PTR_ERR_OR_ZERO(update); ++ if (ret) ++ goto err; ++ ++ bkey_init(&update->k); ++ update->k.p = freespace_iter->pos; ++ bch2_key_resize(&update->k, 1); ++ ++ ret = bch2_trans_update(trans, freespace_iter, update, 0) ?: ++ bch2_trans_commit(trans, NULL, NULL, 0); ++ goto out; ++} ++ ++int bch2_check_alloc_info(struct bch_fs *c, bool initial) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret = 0, last_dev = -1; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ if (k.k->p.inode != last_dev) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode); ++ ++ if (!ca->mi.freespace_initialized) { ++ bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); ++ continue; ++ } ++ ++ last_dev = k.k->p.inode; ++ } ++ ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ bch2_check_alloc_key(&trans, &iter)); ++ if (ret) ++ break; ++ } + bch2_trans_iter_exit(&trans, &iter); ++ ++ if (ret) ++ goto err; ++ ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_freespace, POS_MIN, ++ BTREE_ITER_PREFETCH); ++ while (1) { ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ bch2_check_freespace_key(&trans, &iter, initial)); ++ if (ret) ++ break; ++ ++ bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos)); ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++err: + bch2_trans_exit(&trans); ++ return ret < 0 ? 
ret : 0; ++} ++ ++static int bch2_clear_need_discard(struct btree_trans *trans, struct bpos pos, ++ struct bch_dev *ca, bool *discard_done) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_i_alloc_v4 *a; ++ struct printbuf buf = PRINTBUF; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, pos, ++ BTREE_ITER_CACHED); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto out; ++ ++ a = bch2_alloc_to_v4_mut(trans, k); ++ ret = PTR_ERR_OR_ZERO(a); ++ if (ret) ++ goto out; ++ ++ if (BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) { ++ a->v.gen++; ++ SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); ++ goto write; ++ } ++ ++ BUG_ON(a->v.journal_seq > c->journal.flushed_seq_ondisk); ++ ++ if (bch2_fs_inconsistent_on(!BCH_ALLOC_V4_NEED_DISCARD(&a->v), c, ++ "%s\n incorrectly set in need_discard btree", ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ ret = -EIO; ++ goto out; ++ } ++ ++ if (!*discard_done && ca->mi.discard && !c->opts.nochanges) { ++ /* ++ * This works without any other locks because this is the only ++ * thread that removes items from the need_discard tree ++ */ ++ bch2_trans_unlock(trans); ++ blkdev_issue_discard(ca->disk_sb.bdev, ++ k.k->p.offset * ca->mi.bucket_size, ++ ca->mi.bucket_size, ++ GFP_KERNEL, 0); ++ *discard_done = true; ++ ++ ret = bch2_trans_relock(trans) ? 
0 : -EINTR; ++ if (ret) ++ goto out; ++ } ++ ++ SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); ++write: ++ ret = bch2_trans_update(trans, &iter, &a->k_i, 0); ++out: ++ bch2_trans_iter_exit(trans, &iter); ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++static void bch2_do_discards_work(struct work_struct *work) ++{ ++ struct bch_fs *c = container_of(work, struct bch_fs, discard_work); ++ struct bch_dev *ca = NULL; ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_need_discard, ++ POS_MIN, 0, k, ret) { ++ bool discard_done = false; ++ ++ if (ca && k.k->p.inode != ca->dev_idx) { ++ percpu_ref_put(&ca->io_ref); ++ ca = NULL; ++ } ++ ++ if (!ca) { ++ ca = bch_dev_bkey_exists(c, k.k->p.inode); ++ if (!percpu_ref_tryget(&ca->io_ref)) { ++ ca = NULL; ++ bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); ++ continue; ++ } ++ } ++ ++ seen++; ++ ++ if (bch2_bucket_is_open_safe(c, k.k->p.inode, k.k->p.offset)) { ++ open++; ++ continue; ++ } ++ ++ if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, ++ c->journal.flushed_seq_ondisk, ++ k.k->p.inode, k.k->p.offset)) { ++ need_journal_commit++; ++ continue; ++ } ++ ++ ret = __bch2_trans_do(&trans, NULL, NULL, ++ BTREE_INSERT_USE_RESERVE| ++ BTREE_INSERT_NOFAIL, ++ bch2_clear_need_discard(&trans, k.k->p, ca, &discard_done)); ++ if (ret) ++ break; ++ ++ discarded++; ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ if (ca) ++ percpu_ref_put(&ca->io_ref); ++ ++ bch2_trans_exit(&trans); ++ ++ if (need_journal_commit * 2 > seen) ++ bch2_journal_flush_async(&c->journal, NULL); ++ ++ percpu_ref_put(&c->writes); ++ ++ trace_do_discards(c, seen, open, need_journal_commit, discarded, ret); ++} ++ ++void bch2_do_discards(struct bch_fs *c) ++{ ++ if (percpu_ref_tryget(&c->writes) && ++ !queue_work(system_long_wq, 
&c->discard_work)) ++ percpu_ref_put(&c->writes); ++} ++ ++static int invalidate_one_bucket(struct btree_trans *trans, struct bch_dev *ca) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter lru_iter, alloc_iter = { NULL }; ++ struct bkey_s_c k; ++ struct bkey_i_alloc_v4 *a; ++ u64 bucket, idx; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru, ++ POS(ca->dev_idx, 0), 0); ++ k = bch2_btree_iter_peek(&lru_iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto out; ++ ++ if (!k.k || k.k->p.inode != ca->dev_idx) ++ goto out; ++ ++ if (bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_lru, c, ++ "non lru key in lru btree")) ++ goto out; ++ ++ idx = k.k->p.offset; ++ bucket = le64_to_cpu(bkey_s_c_to_lru(k).v->idx); ++ ++ a = bch2_trans_start_alloc_update(trans, &alloc_iter, ++ POS(ca->dev_idx, bucket)); ++ ret = PTR_ERR_OR_ZERO(a); ++ if (ret) ++ goto out; ++ ++ if (bch2_fs_inconsistent_on(idx != alloc_lru_idx(a->v), c, ++ "invalidating bucket with wrong lru idx (got %llu should be %llu", ++ idx, alloc_lru_idx(a->v))) ++ goto out; ++ ++ SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); ++ a->v.gen++; ++ a->v.data_type = 0; ++ a->v.dirty_sectors = 0; ++ a->v.cached_sectors = 0; ++ a->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now); ++ a->v.io_time[WRITE] = atomic64_read(&c->io_clock[WRITE].now); ++ ++ ret = bch2_trans_update(trans, &alloc_iter, &a->k_i, ++ BTREE_TRIGGER_BUCKET_INVALIDATE); ++out: ++ bch2_trans_iter_exit(trans, &alloc_iter); ++ bch2_trans_iter_exit(trans, &lru_iter); ++ return ret; ++} ++ ++static void bch2_do_invalidates_work(struct work_struct *work) ++{ ++ struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work); ++ struct bch_dev *ca; ++ struct btree_trans trans; ++ unsigned i; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_member_device(ca, c, i) ++ while (!ret && should_invalidate_buckets(ca)) ++ ret = __bch2_trans_do(&trans, NULL, NULL, ++ BTREE_INSERT_USE_RESERVE| ++ BTREE_INSERT_NOFAIL, ++ 
invalidate_one_bucket(&trans, ca)); ++ ++ bch2_trans_exit(&trans); ++ percpu_ref_put(&c->writes); ++} ++ ++void bch2_do_invalidates(struct bch_fs *c) ++{ ++ if (percpu_ref_tryget(&c->writes)) ++ queue_work(system_long_wq, &c->invalidate_work); ++} ++ ++static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bch_alloc_v4 a; ++ struct bch_member *m; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_alloc, ++ POS(ca->dev_idx, ca->mi.first_bucket), ++ BTREE_ITER_SLOTS| ++ BTREE_ITER_PREFETCH, k, ret) { ++ if (iter.pos.offset >= ca->mi.nbuckets) ++ break; ++ ++ bch2_alloc_to_v4(k, &a); ++ ret = __bch2_trans_do(&trans, NULL, NULL, ++ BTREE_INSERT_LAZY_RW, ++ bch2_bucket_do_index(&trans, k, a, true)); ++ if (ret) ++ break; ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ bch2_trans_exit(&trans); ++ ++ if (ret) { ++ bch_err(ca, "error initializing free space: %i", ret); ++ return ret; ++ } ++ ++ mutex_lock(&c->sb_lock); ++ m = bch2_sb_get_members(c->disk_sb.sb)->members + ca->dev_idx; ++ SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true); ++ mutex_unlock(&c->sb_lock); ++ ++ return ret; ++} ++ ++int bch2_fs_freespace_init(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ int ret = 0; ++ bool doing_init = false; ++ ++ /* ++ * We can crash during the device add path, so we need to check this on ++ * every mount: ++ */ ++ ++ for_each_member_device(ca, c, i) { ++ if (ca->mi.freespace_initialized) ++ continue; ++ ++ if (!doing_init) { ++ bch_info(c, "initializing freespace"); ++ doing_init = true; ++ } ++ ++ ret = bch2_dev_freespace_init(c, ca); ++ if (ret) { ++ percpu_ref_put(&ca->ref); ++ return ret; ++ } ++ } ++ ++ if (doing_init) { ++ mutex_lock(&c->sb_lock); ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ bch_verbose(c, "done initializing freespace"); ++ } ++ + return ret; +} + @@ -1293,620 
+2340,29 @@ index 000000000000..b2735c8591d6 + size_t bucket_nr, int rw) +{ + struct bch_fs *c = trans->c; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, dev); + struct btree_iter iter; -+ struct bucket *g; -+ struct bkey_alloc_buf *a; -+ struct bkey_alloc_unpacked u; -+ u64 *time, now; ++ struct bkey_i_alloc_v4 *a; ++ u64 now; + int ret = 0; + -+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(dev, bucket_nr), -+ BTREE_ITER_CACHED| -+ BTREE_ITER_CACHED_NOFILL| -+ BTREE_ITER_INTENT); -+ ret = bch2_btree_iter_traverse(&iter); -+ if (ret) -+ goto out; -+ -+ a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf)); ++ a = bch2_trans_start_alloc_update(trans, &iter, POS(dev, bucket_nr)); + ret = PTR_ERR_OR_ZERO(a); + if (ret) -+ goto out; ++ return ret; + -+ percpu_down_read(&c->mark_lock); -+ g = bucket(ca, bucket_nr); -+ u = alloc_mem_to_key(&iter, g, READ_ONCE(g->mark)); -+ percpu_up_read(&c->mark_lock); -+ -+ time = rw == READ ? &u.read_time : &u.write_time; + now = atomic64_read(&c->io_clock[rw].now); -+ if (*time == now) ++ if (a->v.io_time[rw] == now) + goto out; + -+ *time = now; ++ a->v.io_time[rw] = now; + -+ bch2_alloc_pack(c, a, u); -+ ret = bch2_trans_update(trans, &iter, &a->k, 0) ?: ++ ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: + bch2_trans_commit(trans, NULL, NULL, 0); +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + -+/* Background allocator thread: */ -+ -+/* -+ * Scans for buckets to be invalidated, invalidates them, rewrites prios/gens -+ * (marking them as invalidated on disk), then optionally issues discard -+ * commands to the newly free buckets, then puts them on the various freelists. 
-+ */ -+ -+static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b, -+ struct bucket_mark m) -+{ -+ u8 gc_gen; -+ -+ if (!is_available_bucket(m)) -+ return false; -+ -+ if (m.owned_by_allocator) -+ return false; -+ -+ if (ca->buckets_nouse && -+ test_bit(b, ca->buckets_nouse)) -+ return false; -+ -+ gc_gen = bucket_gc_gen(bucket(ca, b)); -+ -+ ca->inc_gen_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX / 2; -+ ca->inc_gen_really_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX; -+ -+ return gc_gen < BUCKET_GC_GEN_MAX; -+} -+ -+/* -+ * Determines what order we're going to reuse buckets, smallest bucket_key() -+ * first. -+ */ -+ -+static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m, -+ u64 now, u64 last_seq_ondisk) -+{ -+ unsigned used = bucket_sectors_used(m); -+ -+ if (used) { -+ /* -+ * Prefer to keep buckets that have been read more recently, and -+ * buckets that have more data in them: -+ */ -+ u64 last_read = max_t(s64, 0, now - g->io_time[READ]); -+ u32 last_read_scaled = max_t(u64, U32_MAX, div_u64(last_read, used)); -+ -+ return -last_read_scaled; -+ } else { -+ /* -+ * Prefer to use buckets with smaller gc_gen so that we don't -+ * have to walk the btree and recalculate oldest_gen - but shift -+ * off the low bits so that buckets will still have equal sort -+ * keys when there's only a small difference, so that we can -+ * keep sequential buckets together: -+ */ -+ return (bucket_needs_journal_commit(m, last_seq_ondisk) << 4)| -+ (bucket_gc_gen(g) >> 4); -+ } -+} -+ -+static inline int bucket_alloc_cmp(alloc_heap *h, -+ struct alloc_heap_entry l, -+ struct alloc_heap_entry r) -+{ -+ return cmp_int(l.key, r.key) ?: -+ cmp_int(r.nr, l.nr) ?: -+ cmp_int(l.bucket, r.bucket); -+} -+ -+static inline int bucket_idx_cmp(const void *_l, const void *_r) -+{ -+ const struct alloc_heap_entry *l = _l, *r = _r; -+ -+ return cmp_int(l->bucket, r->bucket); -+} -+ -+static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) -+{ -+ struct 
bucket_array *buckets; -+ struct alloc_heap_entry e = { 0 }; -+ u64 now, last_seq_ondisk; -+ size_t b, i, nr = 0; -+ -+ down_read(&ca->bucket_lock); -+ -+ buckets = bucket_array(ca); -+ ca->alloc_heap.used = 0; -+ now = atomic64_read(&c->io_clock[READ].now); -+ last_seq_ondisk = c->journal.last_seq_ondisk; -+ -+ /* -+ * Find buckets with lowest read priority, by building a maxheap sorted -+ * by read priority and repeatedly replacing the maximum element until -+ * all buckets have been visited. -+ */ -+ for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) { -+ struct bucket *g = &buckets->b[b]; -+ struct bucket_mark m = READ_ONCE(g->mark); -+ unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk); -+ -+ cond_resched(); -+ -+ if (!bch2_can_invalidate_bucket(ca, b, m)) -+ continue; -+ -+ if (e.nr && e.bucket + e.nr == b && e.key == key) { -+ e.nr++; -+ } else { -+ if (e.nr) -+ heap_add_or_replace(&ca->alloc_heap, e, -+ -bucket_alloc_cmp, NULL); -+ -+ e = (struct alloc_heap_entry) { -+ .bucket = b, -+ .nr = 1, -+ .key = key, -+ }; -+ } -+ } -+ -+ if (e.nr) -+ heap_add_or_replace(&ca->alloc_heap, e, -+ -bucket_alloc_cmp, NULL); -+ -+ for (i = 0; i < ca->alloc_heap.used; i++) -+ nr += ca->alloc_heap.data[i].nr; -+ -+ while (nr - ca->alloc_heap.data[0].nr >= ALLOC_SCAN_BATCH(ca)) { -+ nr -= ca->alloc_heap.data[0].nr; -+ heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp, NULL); -+ } -+ -+ up_read(&ca->bucket_lock); -+} -+ -+static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca) -+{ -+ struct bucket_array *buckets = bucket_array(ca); -+ struct bucket_mark m; -+ size_t b, start; -+ -+ if (ca->fifo_last_bucket < ca->mi.first_bucket || -+ ca->fifo_last_bucket >= ca->mi.nbuckets) -+ ca->fifo_last_bucket = ca->mi.first_bucket; -+ -+ start = ca->fifo_last_bucket; -+ -+ do { -+ ca->fifo_last_bucket++; -+ if (ca->fifo_last_bucket == ca->mi.nbuckets) -+ ca->fifo_last_bucket = ca->mi.first_bucket; -+ -+ b = ca->fifo_last_bucket; -+ m = 
READ_ONCE(buckets->b[b].mark); -+ -+ if (bch2_can_invalidate_bucket(ca, b, m)) { -+ struct alloc_heap_entry e = { .bucket = b, .nr = 1, }; -+ -+ heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); -+ if (heap_full(&ca->alloc_heap)) -+ break; -+ } -+ -+ cond_resched(); -+ } while (ca->fifo_last_bucket != start); -+} -+ -+static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca) -+{ -+ struct bucket_array *buckets = bucket_array(ca); -+ struct bucket_mark m; -+ size_t checked, i; -+ -+ for (checked = 0; -+ checked < ca->mi.nbuckets / 2; -+ checked++) { -+ size_t b = bch2_rand_range(ca->mi.nbuckets - -+ ca->mi.first_bucket) + -+ ca->mi.first_bucket; -+ -+ m = READ_ONCE(buckets->b[b].mark); -+ -+ if (bch2_can_invalidate_bucket(ca, b, m)) { -+ struct alloc_heap_entry e = { .bucket = b, .nr = 1, }; -+ -+ heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); -+ if (heap_full(&ca->alloc_heap)) -+ break; -+ } -+ -+ cond_resched(); -+ } -+ -+ sort(ca->alloc_heap.data, -+ ca->alloc_heap.used, -+ sizeof(ca->alloc_heap.data[0]), -+ bucket_idx_cmp, NULL); -+ -+ /* remove duplicates: */ -+ for (i = 0; i + 1 < ca->alloc_heap.used; i++) -+ if (ca->alloc_heap.data[i].bucket == -+ ca->alloc_heap.data[i + 1].bucket) -+ ca->alloc_heap.data[i].nr = 0; -+} -+ -+static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) -+{ -+ size_t i, nr = 0; -+ -+ ca->inc_gen_needs_gc = 0; -+ ca->inc_gen_really_needs_gc = 0; -+ -+ switch (ca->mi.replacement) { -+ case BCH_CACHE_REPLACEMENT_lru: -+ find_reclaimable_buckets_lru(c, ca); -+ break; -+ case BCH_CACHE_REPLACEMENT_fifo: -+ find_reclaimable_buckets_fifo(c, ca); -+ break; -+ case BCH_CACHE_REPLACEMENT_random: -+ find_reclaimable_buckets_random(c, ca); -+ break; -+ } -+ -+ heap_resort(&ca->alloc_heap, bucket_alloc_cmp, NULL); -+ -+ for (i = 0; i < ca->alloc_heap.used; i++) -+ nr += ca->alloc_heap.data[i].nr; -+ -+ return nr; -+} -+ -+/* -+ * returns sequence number of most recent journal entry that 
updated this -+ * bucket: -+ */ -+static u64 bucket_journal_seq(struct bch_fs *c, struct bucket_mark m) -+{ -+ if (m.journal_seq_valid) { -+ u64 journal_seq = atomic64_read(&c->journal.seq); -+ u64 bucket_seq = journal_seq; -+ -+ bucket_seq &= ~((u64) U16_MAX); -+ bucket_seq |= m.journal_seq; -+ -+ if (bucket_seq > journal_seq) -+ bucket_seq -= 1 << 16; -+ -+ return bucket_seq; -+ } else { -+ return 0; -+ } -+} -+ -+static int bucket_invalidate_btree(struct btree_trans *trans, -+ struct bch_dev *ca, u64 b) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_alloc_buf *a; -+ struct bkey_alloc_unpacked u; -+ struct bucket *g; -+ struct bucket_mark m; -+ struct btree_iter iter; -+ int ret; -+ -+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, -+ POS(ca->dev_idx, b), -+ BTREE_ITER_CACHED| -+ BTREE_ITER_CACHED_NOFILL| -+ BTREE_ITER_INTENT); -+ -+ a = bch2_trans_kmalloc(trans, sizeof(*a)); -+ ret = PTR_ERR_OR_ZERO(a); -+ if (ret) -+ goto err; -+ -+ ret = bch2_btree_iter_traverse(&iter); -+ if (ret) -+ goto err; -+ -+ percpu_down_read(&c->mark_lock); -+ g = bucket(ca, b); -+ m = READ_ONCE(g->mark); -+ u = alloc_mem_to_key(&iter, g, m); -+ percpu_up_read(&c->mark_lock); -+ -+ u.gen++; -+ u.data_type = 0; -+ u.dirty_sectors = 0; -+ u.cached_sectors = 0; -+ u.read_time = atomic64_read(&c->io_clock[READ].now); -+ u.write_time = atomic64_read(&c->io_clock[WRITE].now); -+ -+ bch2_alloc_pack(c, a, u); -+ ret = bch2_trans_update(trans, &iter, &a->k, -+ BTREE_TRIGGER_BUCKET_INVALIDATE); -+err: -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, -+ u64 *journal_seq, unsigned flags) -+{ -+ struct bucket *g; -+ struct bucket_mark m; -+ size_t b; -+ int ret = 0; -+ -+ BUG_ON(!ca->alloc_heap.used || -+ !ca->alloc_heap.data[0].nr); -+ b = ca->alloc_heap.data[0].bucket; -+ -+ /* first, put on free_inc and mark as owned by allocator: */ -+ percpu_down_read(&c->mark_lock); -+ g = bucket(ca, b); -+ m = 
READ_ONCE(g->mark); -+ -+ BUG_ON(m.dirty_sectors); -+ -+ bch2_mark_alloc_bucket(c, ca, b, true); -+ -+ spin_lock(&c->freelist_lock); -+ verify_not_on_freelist(c, ca, b); -+ BUG_ON(!fifo_push(&ca->free_inc, b)); -+ spin_unlock(&c->freelist_lock); -+ -+ /* -+ * If we're not invalidating cached data, we only increment the bucket -+ * gen in memory here, the incremented gen will be updated in the btree -+ * by bch2_trans_mark_pointer(): -+ */ -+ if (!m.cached_sectors && -+ !bucket_needs_journal_commit(m, c->journal.last_seq_ondisk)) { -+ BUG_ON(m.data_type); -+ bucket_cmpxchg(g, m, m.gen++); -+ percpu_up_read(&c->mark_lock); -+ goto out; -+ } -+ -+ percpu_up_read(&c->mark_lock); -+ -+ /* -+ * If the read-only path is trying to shut down, we can't be generating -+ * new btree updates: -+ */ -+ if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags)) { -+ ret = 1; -+ goto out; -+ } -+ -+ ret = bch2_trans_do(c, NULL, journal_seq, -+ BTREE_INSERT_NOCHECK_RW| -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_JOURNAL_RESERVED| -+ flags, -+ bucket_invalidate_btree(&trans, ca, b)); -+out: -+ if (!ret) { -+ /* remove from alloc_heap: */ -+ struct alloc_heap_entry e, *top = ca->alloc_heap.data; -+ -+ top->bucket++; -+ top->nr--; -+ -+ if (!top->nr) -+ heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); -+ -+ /* -+ * Make sure we flush the last journal entry that updated this -+ * bucket (i.e. deleting the last reference) before writing to -+ * this bucket again: -+ */ -+ *journal_seq = max(*journal_seq, bucket_journal_seq(c, m)); -+ } else { -+ size_t b2; -+ -+ /* remove from free_inc: */ -+ percpu_down_read(&c->mark_lock); -+ spin_lock(&c->freelist_lock); -+ -+ bch2_mark_alloc_bucket(c, ca, b, false); -+ -+ BUG_ON(!fifo_pop_back(&ca->free_inc, b2)); -+ BUG_ON(b != b2); -+ -+ spin_unlock(&c->freelist_lock); -+ percpu_up_read(&c->mark_lock); -+ } -+ -+ return ret < 0 ? 
ret : 0; -+} -+ -+/* -+ * Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc: -+ */ -+static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) -+{ -+ u64 journal_seq = 0; -+ int ret = 0; -+ -+ /* Only use nowait if we've already invalidated at least one bucket: */ -+ while (!ret && -+ !fifo_full(&ca->free_inc) && -+ ca->alloc_heap.used) { -+ if (kthread_should_stop()) { -+ ret = 1; -+ break; -+ } -+ -+ ret = bch2_invalidate_one_bucket(c, ca, &journal_seq, -+ (!fifo_empty(&ca->free_inc) -+ ? BTREE_INSERT_NOWAIT : 0)); -+ /* -+ * We only want to batch up invalidates when they're going to -+ * require flushing the journal: -+ */ -+ if (!journal_seq) -+ break; -+ } -+ -+ /* If we used NOWAIT, don't return the error: */ -+ if (!fifo_empty(&ca->free_inc)) -+ ret = 0; -+ if (ret < 0) -+ bch_err(ca, "error invalidating buckets: %i", ret); -+ if (ret) -+ return ret; -+ -+ if (journal_seq) -+ ret = bch2_journal_flush_seq(&c->journal, journal_seq); -+ if (ret) { -+ bch_err(ca, "journal error: %i", ret); -+ return ret; -+ } -+ -+ return 0; -+} -+ -+static void alloc_thread_set_state(struct bch_dev *ca, unsigned new_state) -+{ -+ if (ca->allocator_state != new_state) { -+ ca->allocator_state = new_state; -+ closure_wake_up(&ca->fs->freelist_wait); -+ } -+} -+ -+static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) -+{ -+ unsigned i; -+ int ret = 0; -+ -+ spin_lock(&c->freelist_lock); -+ for (i = 0; i < RESERVE_NR; i++) { -+ /* -+ * Don't strand buckets on the copygc freelist until -+ * after recovery is finished: -+ */ -+ if (i == RESERVE_MOVINGGC && -+ !test_bit(BCH_FS_STARTED, &c->flags)) -+ continue; -+ -+ if (fifo_push(&ca->free[i], b)) { -+ fifo_pop(&ca->free_inc, b); -+ ret = 1; -+ break; -+ } -+ } -+ spin_unlock(&c->freelist_lock); -+ -+ ca->allocator_state = ret -+ ? 
ALLOCATOR_running -+ : ALLOCATOR_blocked_full; -+ closure_wake_up(&c->freelist_wait); -+ return ret; -+} -+ -+static void discard_one_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) -+{ -+ if (ca->mi.discard && -+ blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) -+ blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, b), -+ ca->mi.bucket_size, GFP_NOFS, 0); -+} -+ -+static bool allocator_thread_running(struct bch_dev *ca) -+{ -+ unsigned state = ca->mi.state == BCH_MEMBER_STATE_rw && -+ test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags) -+ ? ALLOCATOR_running -+ : ALLOCATOR_stopped; -+ alloc_thread_set_state(ca, state); -+ return state == ALLOCATOR_running; -+} -+ -+static int buckets_available(struct bch_dev *ca, unsigned long gc_count) -+{ -+ s64 available = dev_buckets_reclaimable(ca) - -+ (gc_count == ca->fs->gc_count ? ca->inc_gen_really_needs_gc : 0); -+ bool ret = available > 0; -+ -+ alloc_thread_set_state(ca, ret -+ ? ALLOCATOR_running -+ : ALLOCATOR_blocked); -+ return ret; -+} -+ -+/** -+ * bch_allocator_thread - move buckets from free_inc to reserves -+ * -+ * The free_inc FIFO is populated by find_reclaimable_buckets(), and -+ * the reserves are depleted by bucket allocation. When we run out -+ * of free_inc, try to invalidate some buckets and write out -+ * prios and gens. 
-+ */ -+static int bch2_allocator_thread(void *arg) -+{ -+ struct bch_dev *ca = arg; -+ struct bch_fs *c = ca->fs; -+ unsigned long gc_count = c->gc_count; -+ size_t nr; -+ int ret; -+ -+ set_freezable(); -+ -+ while (1) { -+ ret = kthread_wait_freezable(allocator_thread_running(ca)); -+ if (ret) -+ goto stop; -+ -+ while (!ca->alloc_heap.used) { -+ cond_resched(); -+ -+ ret = kthread_wait_freezable(buckets_available(ca, gc_count)); -+ if (ret) -+ goto stop; -+ -+ gc_count = c->gc_count; -+ nr = find_reclaimable_buckets(c, ca); -+ -+ trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc, -+ ca->inc_gen_really_needs_gc); -+ -+ if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) || -+ ca->inc_gen_really_needs_gc) && -+ c->gc_thread) { -+ atomic_inc(&c->kick_gc); -+ wake_up_process(c->gc_thread); -+ } -+ } -+ -+ ret = bch2_invalidate_buckets(c, ca); -+ if (ret) -+ goto stop; -+ -+ while (!fifo_empty(&ca->free_inc)) { -+ u64 b = fifo_peek(&ca->free_inc); -+ -+ discard_one_bucket(c, ca, b); -+ -+ ret = kthread_wait_freezable(push_invalidated_bucket(c, ca, b)); -+ if (ret) -+ goto stop; -+ } -+ } -+stop: -+ alloc_thread_set_state(ca, ALLOCATOR_stopped); -+ return 0; -+} -+ +/* Startup/shutdown (ro/rw): */ + +void bch2_recalc_capacity(struct bch_fs *c) @@ -1915,7 +2371,7 @@ index 000000000000..b2735c8591d6 + u64 capacity = 0, reserved_sectors = 0, gc_reserve; + unsigned bucket_size_max = 0; + unsigned long ra_pages = 0; -+ unsigned i, j; ++ unsigned i; + + lockdep_assert_held(&c->state_lock); + @@ -1946,8 +2402,9 @@ index 000000000000..b2735c8591d6 + * allocations for foreground writes must wait - + * not -ENOSPC calculations. 
+ */ -+ for (j = 0; j < RESERVE_NONE; j++) -+ dev_reserve += ca->free[j].size; ++ ++ dev_reserve += ca->nr_btree_reserve * 2; ++ dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */ + + dev_reserve += 1; /* btree write point */ + dev_reserve += 1; /* copygc write point */ @@ -1990,7 +2447,7 @@ index 000000000000..b2735c8591d6 + ob++) { + spin_lock(&ob->lock); + if (ob->valid && !ob->on_partial_list && -+ ob->ptr.dev == ca->dev_idx) ++ ob->dev == ca->dev_idx) + ret = true; + spin_unlock(&ob->lock); + } @@ -2003,8 +2460,6 @@ index 000000000000..b2735c8591d6 +{ + unsigned i; + -+ BUG_ON(ca->alloc_thread); -+ + /* First, remove device from allocation groups: */ + + for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) @@ -2078,180 +2533,120 @@ index 000000000000..b2735c8591d6 + set_bit(ca->dev_idx, c->rw_devs[i].d); +} + -+void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca) -+{ -+ if (ca->alloc_thread) -+ closure_wait_event(&c->freelist_wait, -+ ca->allocator_state != ALLOCATOR_running); -+} -+ -+/* stop allocator thread: */ -+void bch2_dev_allocator_stop(struct bch_dev *ca) -+{ -+ struct task_struct *p; -+ -+ p = rcu_dereference_protected(ca->alloc_thread, 1); -+ ca->alloc_thread = NULL; -+ -+ /* -+ * We need an rcu barrier between setting ca->alloc_thread = NULL and -+ * the thread shutting down to avoid bch2_wake_allocator() racing: -+ * -+ * XXX: it would be better to have the rcu barrier be asynchronous -+ * instead of blocking us here -+ */ -+ synchronize_rcu(); -+ -+ if (p) { -+ kthread_stop(p); -+ put_task_struct(p); -+ } -+} -+ -+/* start allocator thread: */ -+int bch2_dev_allocator_start(struct bch_dev *ca) -+{ -+ struct task_struct *p; -+ -+ /* -+ * allocator thread already started? 
-+ */ -+ if (ca->alloc_thread) -+ return 0; -+ -+ p = kthread_create(bch2_allocator_thread, ca, -+ "bch-alloc/%s", ca->name); -+ if (IS_ERR(p)) { -+ bch_err(ca->fs, "error creating allocator thread: %li", -+ PTR_ERR(p)); -+ return PTR_ERR(p); -+ } -+ -+ get_task_struct(p); -+ rcu_assign_pointer(ca->alloc_thread, p); -+ wake_up_process(p); -+ return 0; -+} -+ +void bch2_fs_allocator_background_init(struct bch_fs *c) +{ + spin_lock_init(&c->freelist_lock); -+} -+ -+void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c) -+{ -+ struct open_bucket *ob; -+ -+ for (ob = c->open_buckets; -+ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); -+ ob++) { -+ spin_lock(&ob->lock); -+ if (ob->valid && !ob->on_partial_list) { -+ pr_buf(out, "%zu ref %u type %s\n", -+ ob - c->open_buckets, -+ atomic_read(&ob->pin), -+ bch2_data_types[ob->type]); -+ } -+ spin_unlock(&ob->lock); -+ } -+ ++ INIT_WORK(&c->discard_work, bch2_do_discards_work); ++ INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work); +} diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h new file mode 100644 -index 000000000000..370573f8e05d +index 000000000000..da1b650e8017 --- /dev/null +++ b/fs/bcachefs/alloc_background.h -@@ -0,0 +1,143 @@ +@@ -0,0 +1,139 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ALLOC_BACKGROUND_H +#define _BCACHEFS_ALLOC_BACKGROUND_H + +#include "bcachefs.h" +#include "alloc_types.h" ++#include "buckets.h" +#include "debug.h" -+ -+extern const char * const bch2_allocator_states[]; -+ -+struct bkey_alloc_unpacked { -+ u64 journal_seq; -+ u64 bucket; -+ u8 dev; -+ u8 gen; -+ u8 oldest_gen; -+ u8 data_type; -+#define x(_name, _bits) u##_bits _name; -+ BCH_ALLOC_FIELDS_V2() -+#undef x -+}; -+ -+struct bkey_alloc_buf { -+ struct bkey_i k; -+ struct bch_alloc_v3 v; -+ -+#define x(_name, _bits) + _bits / 8 -+ u8 _pad[0 + BCH_ALLOC_FIELDS_V2()]; -+#undef x -+} __attribute__((packed, aligned(8))); ++#include "super.h" + +/* How out of date a 
pointer gen is allowed to be: */ +#define BUCKET_GC_GEN_MAX 96U + -+/* returns true if not equal */ -+static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l, -+ struct bkey_alloc_unpacked r) ++static inline u8 alloc_gc_gen(struct bch_alloc_v4 a) +{ -+ return l.gen != r.gen || -+ l.oldest_gen != r.oldest_gen || -+ l.data_type != r.data_type -+#define x(_name, ...) || l._name != r._name -+ BCH_ALLOC_FIELDS_V2() -+#undef x -+ ; ++ return a.gen - a.oldest_gen; +} + -+struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c); -+void bch2_alloc_pack(struct bch_fs *, struct bkey_alloc_buf *, -+ const struct bkey_alloc_unpacked); ++enum bucket_state { ++ BUCKET_free, ++ BUCKET_need_gc_gens, ++ BUCKET_need_discard, ++ BUCKET_cached, ++ BUCKET_dirty, ++}; ++ ++extern const char * const bch2_bucket_states[]; ++ ++static inline enum bucket_state bucket_state(struct bch_alloc_v4 a) ++{ ++ if (a.dirty_sectors || a.stripe) ++ return BUCKET_dirty; ++ if (a.cached_sectors) ++ return BUCKET_cached; ++ BUG_ON(a.data_type); ++ if (BCH_ALLOC_V4_NEED_DISCARD(&a)) ++ return BUCKET_need_discard; ++ if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX) ++ return BUCKET_need_gc_gens; ++ return BUCKET_free; ++} ++ ++static inline u64 alloc_lru_idx(struct bch_alloc_v4 a) ++{ ++ return bucket_state(a) == BUCKET_cached ? 
a.io_time[READ] : 0; ++} ++ ++static inline u64 alloc_freespace_genbits(struct bch_alloc_v4 a) ++{ ++ return ((u64) alloc_gc_gen(a) >> 4) << 56; ++} ++ ++static inline struct bpos alloc_freespace_pos(struct bpos pos, struct bch_alloc_v4 a) ++{ ++ pos.offset |= alloc_freespace_genbits(a); ++ return pos; ++} ++ ++struct bkey_i_alloc_v4 * ++bch2_trans_start_alloc_update(struct btree_trans *, struct btree_iter *, struct bpos); ++ ++void bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *); ++struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s_c); + +int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); + -+static inline struct bkey_alloc_unpacked -+alloc_mem_to_key(struct btree_iter *iter, -+ struct bucket *g, struct bucket_mark m) -+{ -+ return (struct bkey_alloc_unpacked) { -+ .dev = iter->pos.inode, -+ .bucket = iter->pos.offset, -+ .gen = m.gen, -+ .oldest_gen = g->oldest_gen, -+ .data_type = m.data_type, -+ .dirty_sectors = m.dirty_sectors, -+ .cached_sectors = m.cached_sectors, -+ .read_time = g->io_time[READ], -+ .write_time = g->io_time[WRITE], -+ }; -+} -+ +#define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9) + +const char *bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c); +const char *bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c); +const char *bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c); ++const char *bch2_alloc_v4_invalid(const struct bch_fs *, struct bkey_s_c k); ++void bch2_alloc_v4_swab(struct bkey_s); +void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_alloc (struct bkey_ops) { \ + .key_invalid = bch2_alloc_v1_invalid, \ + .val_to_text = bch2_alloc_to_text, \ ++ .trans_trigger = bch2_trans_mark_alloc, \ ++ .atomic_trigger = bch2_mark_alloc, \ +} + +#define bch2_bkey_ops_alloc_v2 (struct bkey_ops) { \ + .key_invalid = bch2_alloc_v2_invalid, \ + .val_to_text = bch2_alloc_to_text, \ ++ 
.trans_trigger = bch2_trans_mark_alloc, \ ++ .atomic_trigger = bch2_mark_alloc, \ +} + +#define bch2_bkey_ops_alloc_v3 (struct bkey_ops) { \ + .key_invalid = bch2_alloc_v3_invalid, \ + .val_to_text = bch2_alloc_to_text, \ ++ .trans_trigger = bch2_trans_mark_alloc, \ ++ .atomic_trigger = bch2_mark_alloc, \ ++} ++ ++#define bch2_bkey_ops_alloc_v4 (struct bkey_ops) { \ ++ .key_invalid = bch2_alloc_v4_invalid, \ ++ .val_to_text = bch2_alloc_to_text, \ ++ .swab = bch2_alloc_v4_swab, \ ++ .trans_trigger = bch2_trans_mark_alloc, \ ++ .atomic_trigger = bch2_mark_alloc, \ +} + +static inline bool bkey_is_alloc(const struct bkey *k) @@ -2263,54 +2658,38 @@ index 000000000000..370573f8e05d + +int bch2_alloc_read(struct bch_fs *); + -+static inline void bch2_wake_allocator(struct bch_dev *ca) -+{ -+ struct task_struct *p; ++int bch2_trans_mark_alloc(struct btree_trans *, struct bkey_s_c, ++ struct bkey_i *, unsigned); ++int bch2_check_alloc_info(struct bch_fs *, bool); ++void bch2_do_discards(struct bch_fs *); + -+ rcu_read_lock(); -+ p = rcu_dereference(ca->alloc_thread); -+ if (p) -+ wake_up_process(p); -+ rcu_read_unlock(); ++static inline bool should_invalidate_buckets(struct bch_dev *ca) ++{ ++ struct bch_dev_usage u = bch2_dev_usage_read(ca); ++ ++ return u.d[BCH_DATA_cached].buckets && ++ u.buckets_unavailable + u.d[BCH_DATA_cached].buckets < ++ ca->mi.nbuckets >> 7; +} + -+static inline void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca, -+ size_t bucket) -+{ -+ if (bch2_expensive_debug_checks) { -+ size_t iter; -+ long i; -+ unsigned j; ++void bch2_do_invalidates(struct bch_fs *); + -+ for (j = 0; j < RESERVE_NR; j++) -+ fifo_for_each_entry(i, &ca->free[j], iter) -+ BUG_ON(i == bucket); -+ fifo_for_each_entry(i, &ca->free_inc, iter) -+ BUG_ON(i == bucket); -+ } -+} ++int bch2_fs_freespace_init(struct bch_fs *); + +void bch2_recalc_capacity(struct bch_fs *); + +void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); +void 
bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); + -+void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *); -+void bch2_dev_allocator_stop(struct bch_dev *); -+int bch2_dev_allocator_start(struct bch_dev *); -+ -+int bch2_alloc_write(struct bch_fs *, unsigned); +void bch2_fs_allocator_background_init(struct bch_fs *); + -+void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *); -+ +#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c new file mode 100644 -index 000000000000..412fed479482 +index 000000000000..4dbab45be5ed --- /dev/null +++ b/fs/bcachefs/alloc_foreground.c -@@ -0,0 +1,960 @@ +@@ -0,0 +1,1263 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2012 Google, Inc. @@ -2327,19 +2706,31 @@ index 000000000000..412fed479482 +#include "bcachefs.h" +#include "alloc_background.h" +#include "alloc_foreground.h" ++#include "btree_iter.h" ++#include "btree_update.h" +#include "btree_gc.h" +#include "buckets.h" ++#include "buckets_waiting_for_journal.h" +#include "clock.h" +#include "debug.h" +#include "disk_groups.h" +#include "ec.h" ++#include "error.h" +#include "io.h" ++#include "journal.h" + +#include +#include +#include +#include + ++const char * const bch2_alloc_reserves[] = { ++#define x(t) #t, ++ BCH_ALLOC_RESERVES() ++#undef x ++ NULL ++}; ++ +/* + * Open buckets represent a bucket that's currently being allocated from. They + * serve two purposes: @@ -2356,9 +2747,32 @@ index 000000000000..412fed479482 + * reference _after_ doing the index update that makes its allocation reachable. 
+ */ + ++static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob) ++{ ++ open_bucket_idx_t idx = ob - c->open_buckets; ++ open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket); ++ ++ ob->hash = *slot; ++ *slot = idx; ++} ++ ++static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *ob) ++{ ++ open_bucket_idx_t idx = ob - c->open_buckets; ++ open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket); ++ ++ while (*slot != idx) { ++ BUG_ON(!*slot); ++ slot = &c->open_buckets[*slot].hash; ++ } ++ ++ *slot = ob->hash; ++ ob->hash = 0; ++} ++ +void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) +{ -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + + if (ob->ec) { + bch2_ec_bucket_written(c, ob); @@ -2368,14 +2782,15 @@ index 000000000000..412fed479482 + percpu_down_read(&c->mark_lock); + spin_lock(&ob->lock); + -+ bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), false); + ob->valid = false; -+ ob->type = 0; ++ ob->data_type = 0; + + spin_unlock(&ob->lock); + percpu_up_read(&c->mark_lock); + + spin_lock(&c->freelist_lock); ++ bch2_open_bucket_hash_remove(c, ob); ++ + ob->freelist = c->open_buckets_freelist; + c->open_buckets_freelist = ob - c->open_buckets; + @@ -2394,8 +2809,7 @@ index 000000000000..412fed479482 + unsigned i; + + open_bucket_for_each(c, obs, ob, i) -+ if (ob->ptr.dev == dev && -+ ob->ec) ++ if (ob->dev == dev && ob->ec) + bch2_ec_bucket_cancel(c, ob); +} + @@ -2408,7 +2822,7 @@ index 000000000000..412fed479482 + ob = c->open_buckets + c->open_buckets_freelist; + c->open_buckets_freelist = ob->freelist; + atomic_set(&ob->pin, 1); -+ ob->type = 0; ++ ob->data_type = 0; + + c->open_buckets_nr_free--; + return ob; @@ -2418,8 +2832,8 @@ index 000000000000..412fed479482 + struct write_point *wp, + struct open_bucket *ob) +{ -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); -+ bool 
may_realloc = wp->type == BCH_DATA_user; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); ++ bool may_realloc = wp->data_type == BCH_DATA_user; + + BUG_ON(ca->open_buckets_partial_nr > + ARRAY_SIZE(ca->open_buckets_partial)); @@ -2440,85 +2854,62 @@ index 000000000000..412fed479482 + } +} + -+static void verify_not_stale(struct bch_fs *c, const struct open_buckets *obs) -+{ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ struct open_bucket *ob; -+ unsigned i; -+ -+ open_bucket_for_each(c, obs, ob, i) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); -+ -+ BUG_ON(ptr_stale(ca, &ob->ptr)); -+ } -+#endif -+} -+ +/* _only_ for allocating the journal on a new device: */ +long bch2_bucket_alloc_new_fs(struct bch_dev *ca) +{ -+ struct bucket_array *buckets; -+ ssize_t b; ++ while (ca->new_fs_bucket_idx < ca->mi.nbuckets) { ++ u64 b = ca->new_fs_bucket_idx++; + -+ rcu_read_lock(); -+ buckets = bucket_array(ca); ++ if (!is_superblock_bucket(ca, b) && ++ (!ca->buckets_nouse || !test_bit(b, ca->buckets_nouse))) ++ return b; ++ } + -+ for (b = buckets->first_bucket; b < buckets->nbuckets; b++) -+ if (is_available_bucket(buckets->b[b].mark) && -+ !buckets->b[b].mark.owned_by_allocator) -+ goto success; -+ b = -1; -+success: -+ rcu_read_unlock(); -+ return b; ++ return -1; +} + +static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) +{ + switch (reserve) { -+ case RESERVE_BTREE: -+ case RESERVE_BTREE_MOVINGGC: ++ case RESERVE_btree: ++ case RESERVE_btree_movinggc: + return 0; -+ case RESERVE_MOVINGGC: ++ case RESERVE_movinggc: + return OPEN_BUCKETS_COUNT / 4; + default: + return OPEN_BUCKETS_COUNT / 2; + } +} + -+/** -+ * bch_bucket_alloc - allocate a single bucket from a specific device -+ * -+ * Returns index of bucket on success, 0 on failure -+ * */ -+struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, -+ enum alloc_reserve reserve, -+ bool may_alloc_partial, -+ struct closure *cl) ++static struct open_bucket 
*__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, ++ u64 bucket, ++ enum alloc_reserve reserve, ++ struct bch_alloc_v4 *a, ++ u64 *skipped_open, ++ u64 *skipped_need_journal_commit, ++ u64 *skipped_nouse, ++ struct closure *cl) +{ + struct open_bucket *ob; -+ long b = 0; ++ ++ if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) { ++ (*skipped_nouse)++; ++ return NULL; ++ } ++ ++ if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) { ++ (*skipped_open)++; ++ return NULL; ++ } ++ ++ if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, ++ c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) { ++ (*skipped_need_journal_commit)++; ++ return NULL; ++ } + + spin_lock(&c->freelist_lock); + -+ if (may_alloc_partial) { -+ int i; -+ -+ for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) { -+ ob = c->open_buckets + ca->open_buckets_partial[i]; -+ -+ if (reserve <= ob->alloc_reserve) { -+ array_remove_item(ca->open_buckets_partial, -+ ca->open_buckets_partial_nr, -+ i); -+ ob->on_partial_list = false; -+ ob->alloc_reserve = reserve; -+ spin_unlock(&c->freelist_lock); -+ return ob; -+ } -+ } -+ } -+ + if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) { + if (cl) + closure_wait(&c->open_buckets_wait, cl); @@ -2527,36 +2918,18 @@ index 000000000000..412fed479482 + c->blocked_allocate_open_bucket = local_clock(); + + spin_unlock(&c->freelist_lock); -+ trace_open_bucket_alloc_fail(ca, reserve); ++ ++ trace_open_bucket_alloc_fail(ca, bch2_alloc_reserves[reserve]); + return ERR_PTR(-OPEN_BUCKETS_EMPTY); + } + -+ if (likely(fifo_pop(&ca->free[RESERVE_NONE], b))) -+ goto out; -+ -+ switch (reserve) { -+ case RESERVE_BTREE_MOVINGGC: -+ case RESERVE_MOVINGGC: -+ if (fifo_pop(&ca->free[RESERVE_MOVINGGC], b)) -+ goto out; -+ break; -+ default: -+ break; ++ /* Recheck under lock: */ ++ if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) { ++ spin_unlock(&c->freelist_lock); ++ (*skipped_open)++; ++ return NULL; + } + -+ if (cl) 
-+ closure_wait(&c->freelist_wait, cl); -+ -+ if (!c->blocked_allocate) -+ c->blocked_allocate = local_clock(); -+ -+ spin_unlock(&c->freelist_lock); -+ -+ trace_bucket_alloc_fail(ca, reserve); -+ return ERR_PTR(-FREELIST_EMPTY); -+out: -+ verify_not_on_freelist(c, ca, b); -+ + ob = bch2_open_bucket_alloc(c); + + spin_lock(&ob->lock); @@ -2564,15 +2937,14 @@ index 000000000000..412fed479482 + ob->valid = true; + ob->sectors_free = ca->mi.bucket_size; + ob->alloc_reserve = reserve; -+ ob->ptr = (struct bch_extent_ptr) { -+ .type = 1 << BCH_EXTENT_ENTRY_ptr, -+ .gen = bucket(ca, b)->mark.gen, -+ .offset = bucket_to_sector(ca, b), -+ .dev = ca->dev_idx, -+ }; -+ ++ ob->dev = ca->dev_idx; ++ ob->gen = a->gen; ++ ob->bucket = bucket; + spin_unlock(&ob->lock); + ++ ca->nr_open_buckets++; ++ bch2_open_bucket_hash_add(c, ob); ++ + if (c->blocked_allocate_open_bucket) { + bch2_time_stats_update( + &c->times[BCH_TIME_blocked_allocate_open_bucket], @@ -2587,12 +2959,285 @@ index 000000000000..412fed479482 + c->blocked_allocate = 0; + } + -+ ca->nr_open_buckets++; + spin_unlock(&c->freelist_lock); + -+ bch2_wake_allocator(ca); ++ trace_bucket_alloc(ca, bch2_alloc_reserves[reserve]); ++ return ob; ++} ++ ++static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca, ++ enum alloc_reserve reserve, u64 free_entry, ++ u64 *skipped_open, ++ u64 *skipped_need_journal_commit, ++ u64 *skipped_nouse, ++ struct closure *cl) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct open_bucket *ob; ++ struct bch_alloc_v4 a; ++ u64 b = free_entry & ~(~0ULL << 56); ++ unsigned genbits = free_entry >> 56; ++ struct printbuf buf = PRINTBUF; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(ca->dev_idx, b), BTREE_ITER_CACHED); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) { ++ ob = ERR_PTR(ret); ++ goto err; ++ } ++ ++ bch2_alloc_to_v4(k, &a); ++ ++ if 
(bch2_fs_inconsistent_on(bucket_state(a) != BUCKET_free, c, ++ "non free bucket in freespace btree (state %s)\n" ++ " %s\n" ++ " at %llu (genbits %u)", ++ bch2_bucket_states[bucket_state(a)], ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf), ++ free_entry, genbits)) { ++ ob = ERR_PTR(-EIO); ++ goto err; ++ } ++ ++ if (bch2_fs_inconsistent_on(genbits != (alloc_freespace_genbits(a) >> 56), c, ++ "bucket in freespace btree with wrong genbits (got %u should be %llu)\n" ++ " %s", ++ genbits, alloc_freespace_genbits(a) >> 56, ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ ob = ERR_PTR(-EIO); ++ goto err; ++ } ++ ++ if (bch2_fs_inconsistent_on(b < ca->mi.first_bucket || b >= ca->mi.nbuckets, c, ++ "freespace btree has bucket outside allowed range (got %llu, valid %u-%llu)", ++ b, ca->mi.first_bucket, ca->mi.nbuckets)) { ++ ob = ERR_PTR(-EIO); ++ goto err; ++ } ++ ++ ob = __try_alloc_bucket(c, ca, b, reserve, &a, ++ skipped_open, ++ skipped_need_journal_commit, ++ skipped_nouse, ++ cl); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ printbuf_exit(&buf); ++ return ob; ++} ++ ++static struct open_bucket *try_alloc_partial_bucket(struct bch_fs *c, struct bch_dev *ca, ++ enum alloc_reserve reserve) ++{ ++ struct open_bucket *ob; ++ int i; ++ ++ spin_lock(&c->freelist_lock); ++ ++ for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) { ++ ob = c->open_buckets + ca->open_buckets_partial[i]; ++ ++ if (reserve <= ob->alloc_reserve) { ++ array_remove_item(ca->open_buckets_partial, ++ ca->open_buckets_partial_nr, ++ i); ++ ob->on_partial_list = false; ++ ob->alloc_reserve = reserve; ++ spin_unlock(&c->freelist_lock); ++ return ob; ++ } ++ } ++ ++ spin_unlock(&c->freelist_lock); ++ return NULL; ++} ++ ++/* ++ * This path is for before the freespace btree is initialized: ++ * ++ * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock & ++ * journal buckets - journal buckets will be < ca->new_fs_bucket_idx ++ */ ++static noinline struct open_bucket * 
++bch2_bucket_alloc_trans_early(struct btree_trans *trans, ++ struct bch_dev *ca, ++ enum alloc_reserve reserve, ++ u64 *cur_bucket, ++ u64 *buckets_seen, ++ u64 *skipped_open, ++ u64 *skipped_need_journal_commit, ++ u64 *skipped_nouse, ++ struct closure *cl) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct open_bucket *ob = NULL; ++ int ret; ++ ++ *cur_bucket = max_t(u64, *cur_bucket, ca->mi.first_bucket); ++ *cur_bucket = max_t(u64, *cur_bucket, ca->new_fs_bucket_idx); ++ ++ for_each_btree_key(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, *cur_bucket), ++ BTREE_ITER_SLOTS, k, ret) { ++ struct bch_alloc_v4 a; ++ ++ if (bkey_cmp(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0) ++ break; ++ ++ if (ca->new_fs_bucket_idx && ++ is_superblock_bucket(ca, k.k->p.offset)) ++ continue; ++ ++ bch2_alloc_to_v4(k, &a); ++ ++ if (bucket_state(a) != BUCKET_free) ++ continue; ++ ++ (*buckets_seen)++; ++ ++ ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, reserve, &a, ++ skipped_open, ++ skipped_need_journal_commit, ++ skipped_nouse, ++ cl); ++ if (ob) ++ break; ++ } ++ bch2_trans_iter_exit(trans, &iter); ++ ++ *cur_bucket = iter.pos.offset; ++ ++ return ob ?: ERR_PTR(ret ?: -FREELIST_EMPTY); ++} ++ ++static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, ++ struct bch_dev *ca, ++ enum alloc_reserve reserve, ++ u64 *cur_bucket, ++ u64 *buckets_seen, ++ u64 *skipped_open, ++ u64 *skipped_need_journal_commit, ++ u64 *skipped_nouse, ++ struct closure *cl) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct open_bucket *ob = NULL; ++ int ret; ++ ++ if (unlikely(!ca->mi.freespace_initialized)) ++ return bch2_bucket_alloc_trans_early(trans, ca, reserve, ++ cur_bucket, ++ buckets_seen, ++ skipped_open, ++ skipped_need_journal_commit, ++ skipped_nouse, ++ cl); ++ ++ BUG_ON(ca->new_fs_bucket_idx); ++ ++ for_each_btree_key(trans, iter, BTREE_ID_freespace, ++ POS(ca->dev_idx, *cur_bucket), 0, k, ret) { ++ if (k.k->p.inode != ca->dev_idx) ++ 
break; ++ ++ for (*cur_bucket = max(*cur_bucket, bkey_start_offset(k.k)); ++ *cur_bucket != k.k->p.offset && !ob; ++ (*cur_bucket)++) { ++ if (btree_trans_too_many_iters(trans)) { ++ ob = ERR_PTR(-EINTR); ++ break; ++ } ++ ++ (*buckets_seen)++; ++ ++ ob = try_alloc_bucket(trans, ca, reserve, ++ *cur_bucket, ++ skipped_open, ++ skipped_need_journal_commit, ++ skipped_nouse, ++ cl); ++ } ++ if (ob) ++ break; ++ } ++ bch2_trans_iter_exit(trans, &iter); ++ ++ return ob ?: ERR_PTR(ret); ++} ++ ++/** ++ * bch_bucket_alloc - allocate a single bucket from a specific device ++ * ++ * Returns index of bucket on success, 0 on failure ++ * */ ++struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, ++ enum alloc_reserve reserve, ++ bool may_alloc_partial, ++ struct closure *cl) ++{ ++ struct open_bucket *ob = NULL; ++ u64 avail = dev_buckets_available(ca, reserve); ++ u64 cur_bucket = 0; ++ u64 buckets_seen = 0; ++ u64 skipped_open = 0; ++ u64 skipped_need_journal_commit = 0; ++ u64 skipped_nouse = 0; ++ int ret; ++ ++ if (may_alloc_partial) { ++ ob = try_alloc_partial_bucket(c, ca, reserve); ++ if (ob) ++ return ob; ++ } ++again: ++ if (!avail) { ++ if (cl) { ++ closure_wait(&c->freelist_wait, cl); ++ /* recheck after putting ourself on waitlist */ ++ avail = dev_buckets_available(ca, reserve); ++ if (avail) { ++ closure_wake_up(&c->freelist_wait); ++ goto again; ++ } ++ } ++ ++ if (!c->blocked_allocate) ++ c->blocked_allocate = local_clock(); ++ ++ ob = ERR_PTR(-FREELIST_EMPTY); ++ goto err; ++ } ++ ++ ret = bch2_trans_do(c, NULL, NULL, 0, ++ PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, reserve, ++ &cur_bucket, ++ &buckets_seen, ++ &skipped_open, ++ &skipped_need_journal_commit, ++ &skipped_nouse, ++ cl))); ++ ++ if (skipped_need_journal_commit * 2 > avail) ++ bch2_journal_flush_async(&c->journal, NULL); ++err: ++ if (!ob) ++ ob = ERR_PTR(ret ?: -FREELIST_EMPTY); ++ ++ if (IS_ERR(ob)) { ++ trace_bucket_alloc_fail(ca, 
bch2_alloc_reserves[reserve], avail, ++ buckets_seen, ++ skipped_open, ++ skipped_need_journal_commit, ++ skipped_nouse, ++ cl == NULL, PTR_ERR(ob)); ++ atomic_long_inc(&c->bucket_alloc_fail); ++ } + -+ trace_bucket_alloc(ca, reserve); + return ob; +} + @@ -2623,7 +3268,7 @@ index 000000000000..412fed479482 + struct dev_stripe_state *stripe) +{ + u64 *v = stripe->next_alloc + ca->dev_idx; -+ u64 free_space = dev_buckets_available(ca); ++ u64 free_space = dev_buckets_available(ca, RESERVE_none); + u64 free_space_inv = free_space + ? div64_u64(1ULL << 48, free_space) + : 1ULL << 48; @@ -2651,9 +3296,9 @@ index 000000000000..412fed479482 + struct open_bucket *ob) +{ + unsigned durability = -+ bch_dev_bkey_exists(c, ob->ptr.dev)->mi.durability; ++ bch_dev_bkey_exists(c, ob->dev)->mi.durability; + -+ __clear_bit(ob->ptr.dev, devs_may_alloc->d); ++ __clear_bit(ob->dev, devs_may_alloc->d); + *nr_effective += (flags & BUCKET_ALLOC_USE_DURABILITY) + ? durability : 1; + *have_cache |= !durability; @@ -2661,8 +3306,7 @@ index 000000000000..412fed479482 + ob_push(c, ptrs, ob); +} + -+enum bucket_alloc_ret -+bch2_bucket_alloc_set(struct bch_fs *c, ++int bch2_bucket_alloc_set(struct bch_fs *c, + struct open_buckets *ptrs, + struct dev_stripe_state *stripe, + struct bch_devs_mask *devs_may_alloc, @@ -2675,8 +3319,9 @@ index 000000000000..412fed479482 +{ + struct dev_alloc_list devs_sorted = + bch2_dev_alloc_list(c, stripe, devs_may_alloc); ++ unsigned dev; + struct bch_dev *ca; -+ enum bucket_alloc_ret ret = INSUFFICIENT_DEVICES; ++ int ret = -INSUFFICIENT_DEVICES; + unsigned i; + + BUG_ON(*nr_effective >= nr_replicas); @@ -2684,30 +3329,43 @@ index 000000000000..412fed479482 + for (i = 0; i < devs_sorted.nr; i++) { + struct open_bucket *ob; + -+ ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); ++ dev = devs_sorted.devs[i]; ++ ++ rcu_read_lock(); ++ ca = rcu_dereference(c->devs[dev]); ++ if (ca) ++ percpu_ref_get(&ca->ref); ++ rcu_read_unlock(); ++ + if (!ca) + continue; + -+ 
if (!ca->mi.durability && *have_cache) ++ if (!ca->mi.durability && *have_cache) { ++ percpu_ref_put(&ca->ref); + continue; ++ } + + ob = bch2_bucket_alloc(c, ca, reserve, + flags & BUCKET_MAY_ALLOC_PARTIAL, cl); ++ if (!IS_ERR(ob)) ++ bch2_dev_stripe_increment(ca, stripe); ++ percpu_ref_put(&ca->ref); ++ + if (IS_ERR(ob)) { -+ ret = -PTR_ERR(ob); ++ ret = PTR_ERR(ob); + + if (cl) -+ return ret; ++ break; + continue; + } + + add_new_bucket(c, ptrs, devs_may_alloc, + nr_effective, have_cache, flags, ob); + -+ bch2_dev_stripe_increment(ca, stripe); -+ -+ if (*nr_effective >= nr_replicas) -+ return ALLOC_SUCCESS; ++ if (*nr_effective >= nr_replicas) { ++ ret = 0; ++ break; ++ } + } + + return ret; @@ -2721,8 +3379,7 @@ index 000000000000..412fed479482 + * it's to a device we don't want: + */ + -+static enum bucket_alloc_ret -+bucket_alloc_from_stripe(struct bch_fs *c, ++static int bucket_alloc_from_stripe(struct bch_fs *c, + struct open_buckets *ptrs, + struct write_point *wp, + struct bch_devs_mask *devs_may_alloc, @@ -2765,13 +3422,13 @@ index 000000000000..412fed479482 + continue; + + ob = c->open_buckets + h->s->blocks[ec_idx]; -+ if (ob->ptr.dev == devs_sorted.devs[i] && ++ if (ob->dev == devs_sorted.devs[i] && + !test_and_set_bit(ec_idx, h->s->blocks_allocated)) + goto got_bucket; + } + goto out_put_head; +got_bucket: -+ ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ ca = bch_dev_bkey_exists(c, ob->dev); + + ob->ec_idx = ec_idx; + ob->ec = h->s; @@ -2801,12 +3458,12 @@ index 000000000000..412fed479482 + unsigned i; + + open_bucket_for_each(c, &wp->ptrs, ob, i) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + + if (*nr_effective < nr_replicas && -+ test_bit(ob->ptr.dev, devs_may_alloc->d) && ++ test_bit(ob->dev, devs_may_alloc->d) && + (ca->mi.durability || -+ (wp->type == BCH_DATA_user && !*have_cache)) && ++ (wp->data_type == BCH_DATA_user && !*have_cache)) && + (ob->ec || !need_ec)) { + 
add_new_bucket(c, ptrs, devs_may_alloc, + nr_effective, have_cache, @@ -2818,8 +3475,7 @@ index 000000000000..412fed479482 + wp->ptrs = ptrs_skip; +} + -+static enum bucket_alloc_ret -+open_bucket_add_buckets(struct bch_fs *c, ++static int open_bucket_add_buckets(struct bch_fs *c, + struct open_buckets *ptrs, + struct write_point *wp, + struct bch_devs_list *devs_have, @@ -2835,11 +3491,11 @@ index 000000000000..412fed479482 + struct bch_devs_mask devs; + struct open_bucket *ob; + struct closure *cl = NULL; -+ enum bucket_alloc_ret ret; ++ int ret; + unsigned i; + + rcu_read_lock(); -+ devs = target_rw_devs(c, wp->type, target); ++ devs = target_rw_devs(c, wp->data_type, target); + rcu_read_unlock(); + + /* Don't allocate from devices we already have pointers to: */ @@ -2847,7 +3503,7 @@ index 000000000000..412fed479482 + __clear_bit(devs_have->devs[i], devs.d); + + open_bucket_for_each(c, ptrs, ob, i) -+ __clear_bit(ob->ptr.dev, devs.d); ++ __clear_bit(ob->dev, devs.d); + + if (erasure_code) { + if (!ec_open_bucket(c, ptrs)) { @@ -2863,8 +3519,8 @@ index 000000000000..412fed479482 + target, erasure_code, + nr_replicas, nr_effective, + have_cache, flags, _cl); -+ if (ret == FREELIST_EMPTY || -+ ret == OPEN_BUCKETS_EMPTY) ++ if (ret == -FREELIST_EMPTY || ++ ret == -OPEN_BUCKETS_EMPTY) + return ret; + if (*nr_effective >= nr_replicas) + return 0; @@ -2877,9 +3533,6 @@ index 000000000000..412fed479482 + if (*nr_effective >= nr_replicas) + return 0; + -+ percpu_down_read(&c->mark_lock); -+ rcu_read_lock(); -+ +retry_blocking: + /* + * Try nonblocking first, so that if one device is full we'll try from @@ -2888,14 +3541,11 @@ index 000000000000..412fed479482 + ret = bch2_bucket_alloc_set(c, ptrs, &wp->stripe, &devs, + nr_replicas, nr_effective, have_cache, + reserve, flags, cl); -+ if (ret && ret != INSUFFICIENT_DEVICES && !cl && _cl) { ++ if (ret && ret != -INSUFFICIENT_DEVICES && !cl && _cl) { + cl = _cl; + goto retry_blocking; + } + -+ rcu_read_unlock(); -+ 
percpu_up_read(&c->mark_lock); -+ + return ret; +} + @@ -2907,7 +3557,7 @@ index 000000000000..412fed479482 + unsigned i, j; + + open_bucket_for_each(c, obs, ob, i) { -+ bool drop = !ca || ob->ptr.dev == ca->dev_idx; ++ bool drop = !ca || ob->dev == ca->dev_idx; + + if (!drop && ob->ec) { + mutex_lock(&ob->ec->lock); @@ -2916,7 +3566,7 @@ index 000000000000..412fed479482 + continue; + + ob2 = c->open_buckets + ob->ec->blocks[j]; -+ drop |= ob2->ptr.dev == ca->dev_idx; ++ drop |= ob2->dev == ca->dev_idx; + } + mutex_unlock(&ob->ec->lock); + } @@ -3085,7 +3735,7 @@ index 000000000000..412fed479482 + unsigned nr_effective, write_points_nr; + unsigned ob_flags = 0; + bool have_cache; -+ enum bucket_alloc_ret ret; ++ int ret; + int i; + + if (!(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) @@ -3100,11 +3750,11 @@ index 000000000000..412fed479482 + + wp = writepoint_find(c, write_point.v); + -+ if (wp->type == BCH_DATA_user) ++ if (wp->data_type == BCH_DATA_user) + ob_flags |= BUCKET_MAY_ALLOC_PARTIAL; + + /* metadata may not allocate on cache devices: */ -+ if (wp->type != BCH_DATA_user) ++ if (wp->data_type != BCH_DATA_user) + have_cache = true; + + if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { @@ -3134,7 +3784,7 @@ index 000000000000..412fed479482 + if (erasure_code && !ec_open_bucket(c, &ptrs)) + pr_debug("failed to get ec bucket: ret %u", ret); + -+ if (ret == INSUFFICIENT_DEVICES && ++ if (ret == -INSUFFICIENT_DEVICES && + nr_effective >= nr_replicas_required) + ret = 0; + @@ -3154,8 +3804,6 @@ index 000000000000..412fed479482 + + BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX); + -+ verify_not_stale(c, &wp->ptrs); -+ + return wp; +err: + open_bucket_for_each(c, &wp->ptrs, ob, i) @@ -3167,27 +3815,42 @@ index 000000000000..412fed479482 + + mutex_unlock(&wp->lock); + -+ if (ret == FREELIST_EMPTY && ++ if (ret == -FREELIST_EMPTY && + try_decrease_writepoints(c, write_points_nr)) + goto retry; + + switch (ret) { -+ case OPEN_BUCKETS_EMPTY: -+ case 
FREELIST_EMPTY: ++ case -OPEN_BUCKETS_EMPTY: ++ case -FREELIST_EMPTY: + return cl ? ERR_PTR(-EAGAIN) : ERR_PTR(-ENOSPC); -+ case INSUFFICIENT_DEVICES: ++ case -INSUFFICIENT_DEVICES: + return ERR_PTR(-EROFS); + default: -+ BUG(); ++ return ERR_PTR(ret); + } +} + ++struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob) ++{ ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); ++ ++ return (struct bch_extent_ptr) { ++ .type = 1 << BCH_EXTENT_ENTRY_ptr, ++ .gen = ob->gen, ++ .dev = ob->dev, ++ .offset = bucket_to_sector(ca, ob->bucket) + ++ ca->mi.bucket_size - ++ ob->sectors_free, ++ }; ++} ++ +/* + * Append pointers to the space we just allocated to @k, and mark @sectors space + * as allocated out of @ob + */ +void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, -+ struct bkey_i *k, unsigned sectors) ++ struct bkey_i *k, unsigned sectors, ++ bool cached) + +{ + struct open_bucket *ob; @@ -3197,14 +3860,14 @@ index 000000000000..412fed479482 + wp->sectors_free -= sectors; + + open_bucket_for_each(c, &wp->ptrs, ob, i) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); -+ struct bch_extent_ptr tmp = ob->ptr; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); ++ struct bch_extent_ptr ptr = bch2_ob_ptr(c, ob); + -+ tmp.cached = !ca->mi.durability && -+ wp->type == BCH_DATA_user; ++ ptr.cached = cached || ++ (!ca->mi.durability && ++ wp->data_type == BCH_DATA_user); + -+ tmp.offset += ca->mi.bucket_size - ob->sectors_free; -+ bch2_bkey_append_ptr(k, tmp); ++ bch2_bkey_append_ptr(k, ptr); + + BUG_ON(sectors > ob->sectors_free); + ob->sectors_free -= sectors; @@ -3234,7 +3897,7 @@ index 000000000000..412fed479482 + enum bch_data_type type) +{ + mutex_init(&wp->lock); -+ wp->type = type; ++ wp->data_type = type; +} + +void bch2_fs_allocator_foreground_init(struct bch_fs *c) @@ -3271,12 +3934,31 @@ index 000000000000..412fed479482 + writepoint_hash(c, wp->write_point)); + } +} ++ ++void 
bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ struct open_bucket *ob; ++ ++ for (ob = c->open_buckets; ++ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ++ ob++) { ++ spin_lock(&ob->lock); ++ if (ob->valid && !ob->on_partial_list) { ++ pr_buf(out, "%zu ref %u type %s\n", ++ ob - c->open_buckets, ++ atomic_read(&ob->pin), ++ bch2_data_types[ob->data_type]); ++ } ++ spin_unlock(&ob->lock); ++ } ++ ++} diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h new file mode 100644 -index 000000000000..c658295cb8e0 +index 000000000000..8bc78877f0fc --- /dev/null +++ b/fs/bcachefs/alloc_foreground.h -@@ -0,0 +1,138 @@ +@@ -0,0 +1,173 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ALLOC_FOREGROUND_H +#define _BCACHEFS_ALLOC_FOREGROUND_H @@ -3291,12 +3973,7 @@ index 000000000000..c658295cb8e0 +struct bch_fs; +struct bch_devs_List; + -+enum bucket_alloc_ret { -+ ALLOC_SUCCESS, -+ OPEN_BUCKETS_EMPTY, -+ FREELIST_EMPTY, /* Allocator thread not keeping up */ -+ INSUFFICIENT_DEVICES, -+}; ++extern const char * const bch2_alloc_reserves[]; + +struct dev_alloc_list { + unsigned nr; @@ -3371,14 +4048,51 @@ index 000000000000..c658295cb8e0 + unsigned i; + + open_bucket_for_each(c, &wp->ptrs, ob, i) { -+ ob->type = wp->type; ++ ob->data_type = wp->data_type; + atomic_inc(&ob->pin); + ob_push(c, ptrs, ob); + } +} + -+enum bucket_alloc_ret -+bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *, ++static inline open_bucket_idx_t *open_bucket_hashslot(struct bch_fs *c, ++ unsigned dev, u64 bucket) ++{ ++ return c->open_buckets_hash + ++ (jhash_3words(dev, bucket, bucket >> 32, 0) & ++ (OPEN_BUCKETS_COUNT - 1)); ++} ++ ++static inline bool bch2_bucket_is_open(struct bch_fs *c, unsigned dev, u64 bucket) ++{ ++ open_bucket_idx_t slot = *open_bucket_hashslot(c, dev, bucket); ++ ++ while (slot) { ++ struct open_bucket *ob = &c->open_buckets[slot]; ++ ++ if (ob->dev == dev && ob->bucket == bucket) ++ return true; ++ ++ 
slot = ob->hash; ++ } ++ ++ return false; ++} ++ ++static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 bucket) ++{ ++ bool ret; ++ ++ if (bch2_bucket_is_open(c, dev, bucket)) ++ return true; ++ ++ spin_lock(&c->freelist_lock); ++ ret = bch2_bucket_is_open(c, dev, bucket); ++ spin_unlock(&c->freelist_lock); ++ ++ return ret; ++} ++ ++int bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *, + struct dev_stripe_state *, struct bch_devs_mask *, + unsigned, unsigned *, bool *, enum alloc_reserve, + unsigned, struct closure *); @@ -3392,8 +4106,9 @@ index 000000000000..c658295cb8e0 + unsigned, + struct closure *); + ++struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *, struct open_bucket *); +void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *, -+ struct bkey_i *, unsigned); ++ struct bkey_i *, unsigned, bool); +void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *); + +void bch2_open_buckets_stop_dev(struct bch_fs *, struct bch_dev *, @@ -3414,13 +4129,15 @@ index 000000000000..c658295cb8e0 + +void bch2_fs_allocator_foreground_init(struct bch_fs *); + ++void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *); ++ +#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */ diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h new file mode 100644 -index 000000000000..4a1cd8b73d16 +index 000000000000..21b56451bc18 --- /dev/null +++ b/fs/bcachefs/alloc_types.h -@@ -0,0 +1,98 @@ +@@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ALLOC_TYPES_H +#define _BCACHEFS_ALLOC_TYPES_H @@ -3433,51 +4150,48 @@ index 000000000000..4a1cd8b73d16 + +struct ec_bucket_buf; + -+#define ALLOC_THREAD_STATES() \ -+ x(stopped) \ -+ x(running) \ -+ x(blocked) \ -+ x(blocked_full) -+ -+enum allocator_states { -+#define x(n) ALLOCATOR_##n, -+ ALLOC_THREAD_STATES() -+#undef x -+}; ++#define BCH_ALLOC_RESERVES() \ ++ x(btree_movinggc) \ ++ x(btree) \ ++ x(movinggc) \ ++ x(none) + +enum alloc_reserve 
{ -+ RESERVE_BTREE_MOVINGGC = -2, -+ RESERVE_BTREE = -1, -+ RESERVE_MOVINGGC = 0, -+ RESERVE_NONE = 1, -+ RESERVE_NR = 2, ++#define x(name) RESERVE_##name, ++ BCH_ALLOC_RESERVES() ++#undef x +}; + -+typedef FIFO(long) alloc_fifo; -+ +#define OPEN_BUCKETS_COUNT 1024 + +#define WRITE_POINT_HASH_NR 32 +#define WRITE_POINT_MAX 32 + ++/* ++ * 0 is never a valid open_bucket_idx_t: ++ */ +typedef u16 open_bucket_idx_t; + +struct open_bucket { + spinlock_t lock; + atomic_t pin; + open_bucket_idx_t freelist; ++ open_bucket_idx_t hash; + + /* + * When an open bucket has an ec_stripe attached, this is the index of + * the block in the stripe this open_bucket corresponds to: + */ + u8 ec_idx; -+ u8 type; ++ enum bch_data_type data_type:3; + unsigned valid:1; + unsigned on_partial_list:1; + int alloc_reserve:3; ++ + unsigned sectors_free; -+ struct bch_extent_ptr ptr; ++ u8 dev; ++ u8 gen; ++ u64 bucket; + struct ec_stripe_new *ec; +}; + @@ -3497,7 +4211,7 @@ index 000000000000..4a1cd8b73d16 + struct mutex lock; + u64 last_used; + unsigned long write_point; -+ enum bch_data_type type; ++ enum bch_data_type data_type; + + /* calculated based on how many pointers we're actually going to use: */ + unsigned sectors_free; @@ -3510,21 +4224,13 @@ index 000000000000..4a1cd8b73d16 + unsigned long v; +}; + -+struct alloc_heap_entry { -+ size_t bucket; -+ size_t nr; -+ unsigned long key; -+}; -+ -+typedef HEAP(struct alloc_heap_entry) alloc_heap; -+ +#endif /* _BCACHEFS_ALLOC_TYPES_H */ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h new file mode 100644 -index 000000000000..fdf3a777ae16 +index 000000000000..a13845a23387 --- /dev/null +++ b/fs/bcachefs/bcachefs.h -@@ -0,0 +1,958 @@ +@@ -0,0 +1,974 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_H +#define _BCACHEFS_H @@ -3704,7 +4410,11 @@ index 000000000000..fdf3a777ae16 + */ + +#undef pr_fmt ++#ifdef __KERNEL__ +#define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__ ++#else ++#define pr_fmt(fmt) "%s() " fmt 
"\n", __func__ ++#endif + +#include +#include @@ -3727,6 +4437,7 @@ index 000000000000..fdf3a777ae16 +#include + +#include "bcachefs_format.h" ++#include "errcode.h" +#include "fifo.h" +#include "opts.h" +#include "util.h" @@ -3745,8 +4456,8 @@ index 000000000000..fdf3a777ae16 +#define bch2_fmt(_c, fmt) "bcachefs (%s): " fmt "\n", ((_c)->name) +#define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum) +#else -+#define bch2_fmt(_c, fmt) "%s: " fmt "\n", ((_c)->name) -+#define bch2_fmt_inum(_c, _inum, fmt) "%s inum %llu: " fmt "\n", ((_c)->name), (_inum) ++#define bch2_fmt(_c, fmt) fmt "\n" ++#define bch2_fmt_inum(_c, _inum, fmt) "inum %llu: " fmt "\n", (_inum) +#endif + +#define bch_info(c, fmt, ...) \ @@ -3803,9 +4514,6 @@ index 000000000000..fdf3a777ae16 + "significantly affect performance") \ + BCH_DEBUG_PARAM(debug_check_iterators, \ + "Enables extra verification for btree iterators") \ -+ BCH_DEBUG_PARAM(debug_check_bkeys, \ -+ "Run bkey_debugcheck (primarily checking GC/allocation "\ -+ "information) when iterating over keys") \ + BCH_DEBUG_PARAM(debug_check_btree_accounting, \ + "Verify btree accounting for keys within a node") \ + BCH_DEBUG_PARAM(journal_seq_verify, \ @@ -3847,8 +4555,12 @@ index 000000000000..fdf3a777ae16 +#define BCH_TIME_STATS() \ + x(btree_node_mem_alloc) \ + x(btree_node_split) \ ++ x(btree_node_compact) \ ++ x(btree_node_merge) \ + x(btree_node_sort) \ + x(btree_node_read) \ ++ x(btree_interior_update_foreground) \ ++ x(btree_interior_update_total) \ + x(btree_gc) \ + x(btree_lock_contended_read) \ + x(btree_lock_contended_intent) \ @@ -3856,8 +4568,8 @@ index 000000000000..fdf3a777ae16 + x(data_write) \ + x(data_read) \ + x(data_promote) \ -+ x(journal_write) \ -+ x(journal_delay) \ ++ x(journal_flush_write) \ ++ x(journal_noflush_write) \ + x(journal_flush_seq) \ + x(blocked_journal) \ + x(blocked_allocate) \ @@ -3873,6 +4585,7 @@ index 000000000000..fdf3a777ae16 +#include "alloc_types.h" 
+#include "btree_types.h" +#include "buckets_types.h" ++#include "buckets_waiting_for_journal_types.h" +#include "clock_types.h" +#include "ec_types.h" +#include "journal_types.h" @@ -3911,6 +4624,10 @@ index 000000000000..fdf3a777ae16 + GC_PHASE_BTREE_reflink, + GC_PHASE_BTREE_subvolumes, + GC_PHASE_BTREE_snapshots, ++ GC_PHASE_BTREE_lru, ++ GC_PHASE_BTREE_freespace, ++ GC_PHASE_BTREE_need_discard, ++ GC_PHASE_BTREE_backpointers, + + GC_PHASE_PENDING_DELETE, +}; @@ -3954,6 +4671,7 @@ index 000000000000..fdf3a777ae16 + struct bch_sb_handle disk_sb; + struct bch_sb *sb_read_scratch; + int sb_write_error; ++ dev_t dev; + + struct bch_devs_mask self; + @@ -3966,7 +4684,9 @@ index 000000000000..fdf3a777ae16 + * gc_lock, for device resize - holding any is sufficient for access: + * Or rcu_read_lock(), but only for ptr_stale(): + */ -+ struct bucket_array __rcu *buckets[2]; ++ struct bucket_array __rcu *buckets_gc; ++ struct bucket_gens __rcu *bucket_gens; ++ u8 *oldest_gen; + unsigned long *buckets_nouse; + struct rw_semaphore bucket_lock; + @@ -3975,32 +4695,17 @@ index 000000000000..fdf3a777ae16 + struct bch_dev_usage __percpu *usage_gc; + + /* Allocator: */ -+ struct task_struct __rcu *alloc_thread; ++ u64 new_fs_bucket_idx; + -+ /* -+ * free: Buckets that are ready to be used -+ * -+ * free_inc: Incoming buckets - these are buckets that currently have -+ * cached data in them, and we can't reuse them until after we write -+ * their new gen to disk. 
After prio_write() finishes writing the new -+ * gens/prios, they'll be moved to the free list (and possibly discarded -+ * in the process) -+ */ -+ alloc_fifo free[RESERVE_NR]; -+ alloc_fifo free_inc; + unsigned nr_open_buckets; ++ unsigned nr_btree_reserve; + + open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT]; + open_bucket_idx_t open_buckets_partial_nr; + -+ size_t fifo_last_bucket; -+ + size_t inc_gen_needs_gc; + size_t inc_gen_really_needs_gc; -+ -+ enum allocator_states allocator_state; -+ -+ alloc_heap alloc_heap; ++ size_t buckets_waiting_on_journal; + + atomic64_t rebalance_work; + @@ -4022,17 +4727,13 @@ index 000000000000..fdf3a777ae16 + +enum { + /* startup: */ -+ BCH_FS_INITIALIZED, -+ BCH_FS_ALLOC_READ_DONE, + BCH_FS_ALLOC_CLEAN, -+ BCH_FS_ALLOCATOR_RUNNING, -+ BCH_FS_ALLOCATOR_STOPPING, + BCH_FS_INITIAL_GC_DONE, + BCH_FS_INITIAL_GC_UNFIXED, + BCH_FS_TOPOLOGY_REPAIR_DONE, -+ BCH_FS_BTREE_INTERIOR_REPLAY_DONE, + BCH_FS_FSCK_DONE, + BCH_FS_STARTED, ++ BCH_FS_MAY_GO_RW, + BCH_FS_RW, + BCH_FS_WAS_RW, + @@ -4050,16 +4751,11 @@ index 000000000000..fdf3a777ae16 + /* misc: */ + BCH_FS_NEED_ANOTHER_GC, + BCH_FS_DELETED_NODES, -+ BCH_FS_NEED_ALLOC_WRITE, + BCH_FS_REBUILD_REPLICAS, -+ BCH_FS_HOLD_BTREE_WRITES, +}; + +struct btree_debug { + unsigned id; -+ struct dentry *btree; -+ struct dentry *btree_format; -+ struct dentry *failed; +}; + +struct bch_fs_pcpu { @@ -4080,6 +4776,7 @@ index 000000000000..fdf3a777ae16 + enum btree_id btree_id:8; + unsigned level:8; + bool allocated; ++ bool overwritten; + struct bkey_i *k; + u32 journal_seq; + u32 journal_offset; @@ -4156,7 +4853,6 @@ index 000000000000..fdf3a777ae16 + + u16 version; + u16 version_min; -+ u16 encoded_extent_max; + + u8 nr_devices; + u8 clean; @@ -4187,7 +4883,7 @@ index 000000000000..fdf3a777ae16 + struct mutex snapshot_table_lock; + struct work_struct snapshot_delete_work; + struct work_struct snapshot_wait_for_pagecache_and_delete_work; -+ struct snapshot_id_list snapshots_unlinked; ++ 
snapshot_id_list snapshots_unlinked; + struct mutex snapshots_unlinked_lock; + + /* BTREE CACHE */ @@ -4227,8 +4923,10 @@ index 000000000000..fdf3a777ae16 + struct btree_path_buf __percpu *btree_paths_bufs; + + struct srcu_struct btree_trans_barrier; ++ bool btree_trans_barrier_initialized; + + struct btree_key_cache btree_key_cache; ++ unsigned btree_key_cache_btrees; + + struct workqueue_struct *btree_update_wq; + struct workqueue_struct *btree_io_complete_wq; @@ -4277,10 +4975,12 @@ index 000000000000..fdf3a777ae16 + struct closure_waitlist freelist_wait; + u64 blocked_allocate; + u64 blocked_allocate_open_bucket; ++ + open_bucket_idx_t open_buckets_freelist; + open_bucket_idx_t open_buckets_nr_free; + struct closure_waitlist open_buckets_wait; + struct open_bucket open_buckets[OPEN_BUCKETS_COUNT]; ++ open_bucket_idx_t open_buckets_hash[OPEN_BUCKETS_COUNT]; + + struct write_point btree_write_point; + struct write_point rebalance_write_point; @@ -4290,6 +4990,10 @@ index 000000000000..fdf3a777ae16 + struct mutex write_points_hash_lock; + unsigned write_points_nr; + ++ struct buckets_waiting_for_journal buckets_waiting_for_journal; ++ struct work_struct discard_work; ++ struct work_struct invalidate_work; ++ + /* GARBAGE COLLECTION */ + struct task_struct *gc_thread; + atomic_t kick_gc; @@ -4315,6 +5019,7 @@ index 000000000000..fdf3a777ae16 + * it's not while a gc is in progress. 
+ */ + struct rw_semaphore gc_lock; ++ struct mutex gc_gens_lock; + + /* IO PATH */ + struct semaphore io_in_flight; @@ -4352,7 +5057,8 @@ index 000000000000..fdf3a777ae16 + struct mutex data_progress_lock; + + /* STRIPES: */ -+ GENRADIX(struct stripe) stripes[2]; ++ GENRADIX(struct stripe) stripes; ++ GENRADIX(struct gc_stripe) gc_stripes; + + ec_stripes_heap ec_stripes_heap; + spinlock_t ec_stripes_heap_lock; @@ -4376,7 +5082,6 @@ index 000000000000..fdf3a777ae16 + u64 reflink_hint; + reflink_gc_table reflink_gc_table; + size_t reflink_gc_nr; -+ size_t reflink_gc_idx; + + /* VFS IO PATH - fs-io.c */ + struct bio_set writepage_bioset; @@ -4397,7 +5102,8 @@ index 000000000000..fdf3a777ae16 + struct bch_memquota_type quotas[QTYP_NR]; + + /* DEBUG JUNK */ -+ struct dentry *debug; ++ struct dentry *fs_debug_dir; ++ struct dentry *btree_debug_dir; + struct btree_debug btree_debug[BTREE_ID_NR]; + struct btree *verify_data; + struct btree_node *verify_ondisk; @@ -4425,6 +5131,7 @@ index 000000000000..fdf3a777ae16 + atomic_long_t read_realloc_races; + atomic_long_t extent_migrate_done; + atomic_long_t extent_migrate_raced; ++ atomic_long_t bucket_alloc_fail; + + unsigned btree_gc_periodic:1; + unsigned copy_gc_enabled:1; @@ -4448,10 +5155,25 @@ index 000000000000..fdf3a777ae16 + +static inline unsigned block_bytes(const struct bch_fs *c) +{ -+ return c->opts.block_size << 9; ++ return c->opts.block_size; +} + -+static inline struct timespec64 bch2_time_to_timespec(struct bch_fs *c, s64 time) ++static inline unsigned block_sectors(const struct bch_fs *c) ++{ ++ return c->opts.block_size >> 9; ++} ++ ++static inline size_t btree_sectors(const struct bch_fs *c) ++{ ++ return c->opts.btree_node_size >> 9; ++} ++ ++static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree) ++{ ++ return c->btree_key_cache_btrees & (1U << btree); ++} ++ ++static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time) +{ + struct timespec64 t; + s32 
rem; @@ -4463,13 +5185,13 @@ index 000000000000..fdf3a777ae16 + return t; +} + -+static inline s64 timespec_to_bch2_time(struct bch_fs *c, struct timespec64 ts) ++static inline s64 timespec_to_bch2_time(const struct bch_fs *c, struct timespec64 ts) +{ + return (ts.tv_sec * c->sb.time_units_per_sec + + (int) ts.tv_nsec / c->sb.nsec_per_time_unit) - c->sb.time_base_lo; +} + -+static inline s64 bch2_current_time(struct bch_fs *c) ++static inline s64 bch2_current_time(const struct bch_fs *c) +{ + struct timespec64 now; + @@ -4485,10 +5207,10 @@ index 000000000000..fdf3a777ae16 +#endif /* _BCACHEFS_H */ diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h new file mode 100644 -index 000000000000..b115bd1fa5a3 +index 000000000000..8312018e1ed5 --- /dev/null +++ b/fs/bcachefs/bcachefs_format.h -@@ -0,0 +1,1893 @@ +@@ -0,0 +1,1986 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FORMAT_H +#define _BCACHEFS_FORMAT_H @@ -4567,6 +5289,22 @@ index 000000000000..b115bd1fa5a3 +#include +#include +#include ++#include "vstructs.h" ++ ++#define BITMASK(name, type, field, offset, end) \ ++static const unsigned name##_OFFSET = offset; \ ++static const unsigned name##_BITS = (end - offset); \ ++ \ ++static inline __u64 name(const type *k) \ ++{ \ ++ return (k->field >> offset) & ~(~0ULL << (end - offset)); \ ++} \ ++ \ ++static inline void SET_##name(type *k, __u64 v) \ ++{ \ ++ k->field &= ~(~(~0ULL << (end - offset)) << offset); \ ++ k->field |= (v & ~(~0ULL << (end - offset))) << offset; \ ++} + +#define LE_BITMASK(_bits, name, type, field, offset, end) \ +static const unsigned name##_OFFSET = offset; \ @@ -4837,7 +5575,10 @@ index 000000000000..b115bd1fa5a3 + x(subvolume, 21) \ + x(snapshot, 22) \ + x(inode_v2, 23) \ -+ x(alloc_v3, 24) ++ x(alloc_v3, 24) \ ++ x(set, 25) \ ++ x(lru, 26) \ ++ x(alloc_v4, 27) + +enum bch_bkey_type { +#define x(name, nr) KEY_TYPE_##name = nr, @@ -4867,6 +5608,10 @@ index 000000000000..b115bd1fa5a3 + struct bch_val v; 
+}; + ++struct bch_set { ++ struct bch_val v; ++}; ++ +/* Extents */ + +/* @@ -5367,8 +6112,8 @@ index 000000000000..b115bd1fa5a3 +#define BCH_ALLOC_FIELDS_V2() \ + x(read_time, 64) \ + x(write_time, 64) \ -+ x(dirty_sectors, 16) \ -+ x(cached_sectors, 16) \ ++ x(dirty_sectors, 32) \ ++ x(cached_sectors, 32) \ + x(stripe, 32) \ + x(stripe_redundancy, 8) + @@ -5383,11 +6128,34 @@ index 000000000000..b115bd1fa5a3 + __u8 data[]; +} __attribute__((packed, aligned(8))); + ++struct bch_alloc_v4 { ++ struct bch_val v; ++ __u64 journal_seq; ++ __u32 flags; ++ __u8 gen; ++ __u8 oldest_gen; ++ __u8 data_type; ++ __u8 stripe_redundancy; ++ __u32 dirty_sectors; ++ __u32 cached_sectors; ++ __u64 io_time[2]; ++ __u32 stripe; ++ __u32 nr_external_backpointers; ++ struct bpos backpointers[0]; ++} __attribute__((packed, aligned(8))); ++ ++LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1) ++LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2) ++ ++BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1) ++BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2) ++BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8) ++BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14) ++ +enum { +#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name, + BCH_ALLOC_FIELDS_V1() +#undef x -+ BCH_ALLOC_FIELD_NR +}; + +/* Quotas: */ @@ -5505,6 +6273,15 @@ index 000000000000..b115bd1fa5a3 +/* True if a subvolume points to this snapshot node: */ +LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2) + ++/* LRU btree: */ ++ ++struct bch_lru { ++ struct bch_val v; ++ __le64 idx; ++} __attribute__((packed, aligned(8))); ++ ++#define LRU_ID_STRIPES (1U << 16) ++ +/* Optional/variable size superblock sections: */ + +struct bch_sb_field { @@ -5513,16 +6290,17 @@ index 000000000000..b115bd1fa5a3 + __le32 type; +}; + -+#define BCH_SB_FIELDS() \ -+ x(journal, 0) \ -+ x(members, 1) \ -+ x(crypt, 2) \ 
-+ x(replicas_v0, 3) \ -+ x(quota, 4) \ -+ x(disk_groups, 5) \ -+ x(clean, 6) \ -+ x(replicas, 7) \ -+ x(journal_seq_blacklist, 8) ++#define BCH_SB_FIELDS() \ ++ x(journal, 0) \ ++ x(members, 1) \ ++ x(crypt, 2) \ ++ x(replicas_v0, 3) \ ++ x(quota, 4) \ ++ x(disk_groups, 5) \ ++ x(clean, 6) \ ++ x(replicas, 7) \ ++ x(journal_seq_blacklist, 8) \ ++ x(journal_v2, 9) + +enum bch_sb_field_type { +#define x(f, nr) BCH_SB_FIELD_##f = nr, @@ -5531,6 +6309,14 @@ index 000000000000..b115bd1fa5a3 + BCH_SB_FIELD_NR +}; + ++/* ++ * Most superblock fields are replicated in all device's superblocks - a few are ++ * not: ++ */ ++#define BCH_SINGLE_DEVICE_SB_FIELDS \ ++ ((1U << BCH_SB_FIELD_journal)| \ ++ (1U << BCH_SB_FIELD_journal_v2)) ++ +/* BCH_SB_FIELD_journal: */ + +struct bch_sb_field_journal { @@ -5538,6 +6324,15 @@ index 000000000000..b115bd1fa5a3 + __le64 buckets[0]; +}; + ++struct bch_sb_field_journal_v2 { ++ struct bch_sb_field field; ++ ++ struct bch_sb_field_journal_v2_entry { ++ __le64 start; ++ __le64 nr; ++ } d[0]; ++}; ++ +/* BCH_SB_FIELD_members: */ + +#define BCH_MIN_NR_NBUCKETS (1 << 6) @@ -5554,12 +6349,13 @@ index 000000000000..b115bd1fa5a3 +}; + +LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags[0], 0, 4) -+/* 4-10 unused, was TIER, HAS_(META)DATA */ -+LE64_BITMASK(BCH_MEMBER_REPLACEMENT, struct bch_member, flags[0], 10, 14) ++/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */ +LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15) +LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags[0], 15, 20) +LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags[0], 20, 28) +LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags[0], 28, 30) ++LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED, ++ struct bch_member, flags[0], 30, 31) + +#if 0 +LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); @@ -5579,18 +6375,6 @@ index 000000000000..b115bd1fa5a3 + BCH_MEMBER_STATE_NR +}; + -+#define 
BCH_CACHE_REPLACEMENT_POLICIES() \ -+ x(lru, 0) \ -+ x(fifo, 1) \ -+ x(random, 2) -+ -+enum bch_cache_replacement_policies { -+#define x(t, n) BCH_CACHE_REPLACEMENT_##t = n, -+ BCH_CACHE_REPLACEMENT_POLICIES() -+#undef x -+ BCH_CACHE_REPLACEMENT_NR -+}; -+ +struct bch_sb_field_members { + struct bch_sb_field field; + struct bch_member members[0]; @@ -5778,19 +6562,25 @@ index 000000000000..b115bd1fa5a3 +#define BCH_JSET_VERSION_OLD 2 +#define BCH_BSET_VERSION_OLD 3 + ++#define BCH_METADATA_VERSIONS() \ ++ x(bkey_renumber, 10) \ ++ x(inode_btree_change, 11) \ ++ x(snapshot, 12) \ ++ x(inode_backpointers, 13) \ ++ x(btree_ptr_sectors_written, 14) \ ++ x(snapshot_2, 15) \ ++ x(reflink_p_fix, 16) \ ++ x(subvol_dirent, 17) \ ++ x(inode_v2, 18) \ ++ x(freespace, 19) \ ++ x(alloc_v4, 20) ++ +enum bcachefs_metadata_version { -+ bcachefs_metadata_version_min = 9, -+ bcachefs_metadata_version_new_versioning = 10, -+ bcachefs_metadata_version_bkey_renumber = 10, -+ bcachefs_metadata_version_inode_btree_change = 11, -+ bcachefs_metadata_version_snapshot = 12, -+ bcachefs_metadata_version_inode_backpointers = 13, -+ bcachefs_metadata_version_btree_ptr_sectors_written = 14, -+ bcachefs_metadata_version_snapshot_2 = 15, -+ bcachefs_metadata_version_reflink_p_fix = 16, -+ bcachefs_metadata_version_subvol_dirent = 17, -+ bcachefs_metadata_version_inode_v2 = 18, -+ bcachefs_metadata_version_max = 19, ++ bcachefs_metadata_version_min = 9, ++#define x(t, n) bcachefs_metadata_version_##t = n, ++ BCH_METADATA_VERSIONS() ++#undef x ++ bcachefs_metadata_version_max +}; + +#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) @@ -5927,6 +6717,10 @@ index 000000000000..b115bd1fa5a3 +LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28); +LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29); +LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30); ++LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 
62); ++LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63); ++LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32); ++LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33); + +/* + * Features: @@ -6161,7 +6955,8 @@ index 000000000000..b115bd1fa5a3 + x(usage, 5) \ + x(data_usage, 6) \ + x(clock, 7) \ -+ x(dev_usage, 8) ++ x(dev_usage, 8) \ ++ x(log, 9) + +enum { +#define x(f, nr) BCH_JSET_ENTRY_##f = nr, @@ -6191,11 +6986,16 @@ index 000000000000..b115bd1fa5a3 + __le64 end; +}; + ++#define BCH_FS_USAGE_TYPES() \ ++ x(reserved, 0) \ ++ x(inodes, 1) \ ++ x(key_version, 2) ++ +enum { -+ FS_USAGE_RESERVED = 0, -+ FS_USAGE_INODES = 1, -+ FS_USAGE_KEY_VERSION = 2, -+ FS_USAGE_NR = 3 ++#define x(f, nr) BCH_FS_USAGE_##f = nr, ++ BCH_FS_USAGE_TYPES() ++#undef x ++ BCH_FS_USAGE_NR +}; + +struct jset_entry_usage { @@ -6233,6 +7033,17 @@ index 000000000000..b115bd1fa5a3 + struct jset_entry_dev_usage_type d[]; +} __attribute__((packed)); + ++static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage *u) ++{ ++ return (vstruct_bytes(&u->entry) - sizeof(struct jset_entry_dev_usage)) / ++ sizeof(struct jset_entry_dev_usage_type); ++} ++ ++struct jset_entry_log { ++ struct jset_entry entry; ++ u8 d[]; ++} __attribute__((packed)); ++ +/* + * On disk format for a journal entry: + * seq is monotonically increasing; every journal entry has its own unique @@ -6286,7 +7097,11 @@ index 000000000000..b115bd1fa5a3 + x(stripes, 6) \ + x(reflink, 7) \ + x(subvolumes, 8) \ -+ x(snapshots, 9) ++ x(snapshots, 9) \ ++ x(lru, 10) \ ++ x(freespace, 11) \ ++ x(need_discard, 12) \ ++ x(backpointers, 13) + +enum btree_id { +#define x(kwd, val) BTREE_ID_##kwd = val, @@ -6757,10 +7572,10 @@ index 000000000000..930981ad5535 +#endif /* _BCACHEFS_IOCTL_H */ diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c new file mode 100644 -index 000000000000..946dd27f09fc +index 000000000000..4b01ab3029a2 --- /dev/null +++ 
b/fs/bcachefs/bkey.c -@@ -0,0 +1,1171 @@ +@@ -0,0 +1,1172 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -6820,11 +7635,12 @@ index 000000000000..946dd27f09fc + tmp = __bch2_bkey_unpack_key(format, packed); + + if (memcmp(&tmp, unpacked, sizeof(struct bkey))) { -+ char buf1[160], buf2[160]; ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; + char buf3[160], buf4[160]; + -+ bch2_bkey_to_text(&PBUF(buf1), unpacked); -+ bch2_bkey_to_text(&PBUF(buf2), &tmp); ++ bch2_bkey_to_text(&buf1, unpacked); ++ bch2_bkey_to_text(&buf2, &tmp); + bch2_to_binary(buf3, (void *) unpacked, 80); + bch2_to_binary(buf4, high_word(format, packed), 80); + @@ -6835,7 +7651,7 @@ index 000000000000..946dd27f09fc + format->bits_per_field[2], + format->bits_per_field[3], + format->bits_per_field[4], -+ buf1, buf2, buf3, buf4); ++ buf1.buf, buf2.buf, buf3, buf4); + } +} + @@ -8572,10 +9388,10 @@ index 000000000000..0d7c67a959af +#endif /* _BCACHEFS_BKEY_BUF_H */ diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c new file mode 100644 -index 000000000000..5c900cf8a8a2 +index 000000000000..0eac86e5e776 --- /dev/null +++ b/fs/bcachefs/bkey_methods.c -@@ -0,0 +1,450 @@ +@@ -0,0 +1,463 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -8587,6 +9403,7 @@ index 000000000000..5c900cf8a8a2 +#include "error.h" +#include "extents.h" +#include "inode.h" ++#include "lru.h" +#include "quota.h" +#include "reflink.h" +#include "subvolume.h" @@ -8663,6 +9480,24 @@ index 000000000000..5c900cf8a8a2 + .val_to_text = key_type_inline_data_to_text, \ +} + ++static const char *key_type_set_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ if (bkey_val_bytes(k.k)) ++ return "nonempty value"; ++ return NULL; ++} ++ ++static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) ++{ ++ bch2_key_resize(l.k, l.k->size + r.k->size); ++ return true; ++} ++ ++#define bch2_bkey_ops_set (struct bkey_ops) { \ ++ .key_invalid 
= key_type_set_invalid, \ ++ .key_merge = key_type_set_merge, \ ++} ++ +const struct bkey_ops bch2_bkey_ops[] = { +#define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name, + BCH_BKEY_TYPES() @@ -8708,7 +9543,8 @@ index 000000000000..5c900cf8a8a2 + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_alloc)| + (1U << KEY_TYPE_alloc_v2)| -+ (1U << KEY_TYPE_alloc_v3), ++ (1U << KEY_TYPE_alloc_v3)| ++ (1U << KEY_TYPE_alloc_v4), + [BKEY_TYPE_quotas] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_quota), @@ -8725,6 +9561,15 @@ index 000000000000..5c900cf8a8a2 + [BKEY_TYPE_snapshots] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_snapshot), ++ [BKEY_TYPE_lru] = ++ (1U << KEY_TYPE_deleted)| ++ (1U << KEY_TYPE_lru), ++ [BKEY_TYPE_freespace] = ++ (1U << KEY_TYPE_deleted)| ++ (1U << KEY_TYPE_set), ++ [BKEY_TYPE_need_discard] = ++ (1U << KEY_TYPE_deleted)| ++ (1U << KEY_TYPE_set), + [BKEY_TYPE_btree] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_btree_ptr)| @@ -8790,22 +9635,6 @@ index 000000000000..5c900cf8a8a2 + return NULL; +} + -+void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) -+{ -+ const char *invalid; -+ -+ BUG_ON(!k.k->u64s); -+ -+ invalid = bch2_bkey_invalid(c, k, btree_node_type(b)) ?: -+ bch2_bkey_in_btree_node(b, k); -+ if (invalid) { -+ char buf[160]; -+ -+ bch2_bkey_val_to_text(&PBUF(buf), c, k); -+ bch2_fs_inconsistent(c, "invalid bkey %s: %s", buf, invalid); -+ } -+} -+ +void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) +{ + if (!bpos_cmp(pos, POS_MIN)) @@ -9028,10 +9857,10 @@ index 000000000000..5c900cf8a8a2 +} diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h new file mode 100644 -index 000000000000..3012035db1a3 +index 000000000000..2289a09d98fc --- /dev/null +++ b/fs/bcachefs/bkey_methods.h -@@ -0,0 +1,80 @@ +@@ -0,0 +1,105 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BKEY_METHODS_H +#define _BCACHEFS_BKEY_METHODS_H @@ -9040,6 +9869,7 @@ index 000000000000..3012035db1a3 + 
+struct bch_fs; +struct btree; ++struct btree_trans; +struct bkey; +enum btree_node_type; + @@ -9054,6 +9884,10 @@ index 000000000000..3012035db1a3 + void (*swab)(struct bkey_s); + bool (*key_normalize)(struct bch_fs *, struct bkey_s); + bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c); ++ int (*trans_trigger)(struct btree_trans *, struct bkey_s_c, ++ struct bkey_i *, unsigned); ++ int (*atomic_trigger)(struct btree_trans *, struct bkey_s_c, ++ struct bkey_s_c, unsigned); + void (*compat)(enum btree_id id, unsigned version, + unsigned big_endian, int write, + struct bkey_s); @@ -9068,8 +9902,6 @@ index 000000000000..3012035db1a3 + enum btree_node_type); +const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c); + -+void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c); -+ +void bch2_bpos_to_text(struct printbuf *, struct bpos); +void bch2_bkey_to_text(struct printbuf *, const struct bkey *); +void bch2_val_to_text(struct printbuf *, struct bch_fs *, @@ -9093,6 +9925,28 @@ index 000000000000..3012035db1a3 + +bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); + ++static inline int bch2_mark_key(struct btree_trans *trans, ++ struct bkey_s_c old, ++ struct bkey_s_c new, ++ unsigned flags) ++{ ++ const struct bkey_ops *ops = &bch2_bkey_ops[old.k->type ?: new.k->type]; ++ ++ return ops->atomic_trigger ++ ? ops->atomic_trigger(trans, old, new, flags) ++ : 0; ++} ++ ++static inline int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old, ++ struct bkey_i *new, unsigned flags) ++{ ++ const struct bkey_ops *ops = &bch2_bkey_ops[old.k->type ?: new->k.type]; ++ ++ return ops->trans_trigger ++ ? 
ops->trans_trigger(trans, old, new, flags) ++ : 0; ++} ++ +void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int); + +void __bch2_bkey_compat(unsigned, enum btree_id, unsigned, unsigned, @@ -9114,10 +9968,10 @@ index 000000000000..3012035db1a3 +#endif /* _BCACHEFS_BKEY_METHODS_H */ diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c new file mode 100644 -index 000000000000..537ab7919e88 +index 000000000000..b1385a77da11 --- /dev/null +++ b/fs/bcachefs/bkey_sort.c -@@ -0,0 +1,253 @@ +@@ -0,0 +1,198 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "bkey_buf.h" @@ -9237,23 +10091,6 @@ index 000000000000..537ab7919e88 + return nr; +} + -+static void extent_sort_append(struct bch_fs *c, -+ struct bkey_format *f, -+ struct btree_nr_keys *nr, -+ struct bkey_packed **out, -+ struct bkey_s k) -+{ -+ if (!bkey_deleted(k.k)) { -+ if (!bch2_bkey_pack_key(*out, k.k, f)) -+ memcpy_u64s_small(*out, k.k, BKEY_U64s); -+ -+ memcpy_u64s_small(bkeyp_val(f, *out), k.v, bkey_val_u64s(k.k)); -+ -+ btree_keys_account_key_add(nr, 0, *out); -+ *out = bkey_next(*out); -+ } -+} -+ +/* Sort + repack in a new format: */ +struct btree_nr_keys +bch2_sort_repack(struct bset *dst, struct btree *src, @@ -9264,6 +10101,7 @@ index 000000000000..537ab7919e88 + struct bkey_format *in_f = &src->format; + struct bkey_packed *in, *out = vstruct_last(dst); + struct btree_nr_keys nr; ++ bool transform = memcmp(out_f, &src->format, sizeof(*out_f)); + + memset(&nr, 0, sizeof(nr)); + @@ -9271,8 +10109,10 @@ index 000000000000..537ab7919e88 + if (filter_whiteouts && bkey_deleted(in)) + continue; + -+ if (bch2_bkey_transform(out_f, out, bkey_packed(in) -+ ? in_f : &bch2_bkey_format_current, in)) ++ if (!transform) ++ bkey_copy(out, in); ++ else if (bch2_bkey_transform(out_f, out, bkey_packed(in) ++ ? 
in_f : &bch2_bkey_format_current, in)) + out->format = KEY_FORMAT_LOCAL_BTREE; + else + bch2_bkey_unpack(src, (void *) out, in); @@ -9285,47 +10125,6 @@ index 000000000000..537ab7919e88 + return nr; +} + -+/* Sort, repack, and call bch2_bkey_normalize() to drop stale pointers: */ -+struct btree_nr_keys -+bch2_sort_repack_merge(struct bch_fs *c, -+ struct bset *dst, struct btree *src, -+ struct btree_node_iter *iter, -+ struct bkey_format *out_f, -+ bool filter_whiteouts) -+{ -+ struct bkey_packed *out = vstruct_last(dst), *k_packed; -+ struct bkey_buf k; -+ struct btree_nr_keys nr; -+ -+ memset(&nr, 0, sizeof(nr)); -+ bch2_bkey_buf_init(&k); -+ -+ while ((k_packed = bch2_btree_node_iter_next_all(iter, src))) { -+ if (filter_whiteouts && bkey_deleted(k_packed)) -+ continue; -+ -+ /* -+ * NOTE: -+ * bch2_bkey_normalize may modify the key we pass it (dropping -+ * stale pointers) and we don't have a write lock on the src -+ * node; we have to make a copy of the entire key before calling -+ * normalize -+ */ -+ bch2_bkey_buf_realloc(&k, c, k_packed->u64s + BKEY_U64s); -+ bch2_bkey_unpack(src, k.k, k_packed); -+ -+ if (filter_whiteouts && -+ bch2_bkey_normalize(c, bkey_i_to_s(k.k))) -+ continue; -+ -+ extent_sort_append(c, out_f, &nr, &out, bkey_i_to_s(k.k)); -+ } -+ -+ dst->u64s = cpu_to_le16((u64 *) out - dst->_data); -+ bch2_bkey_buf_exit(&k, c); -+ return nr; -+} -+ +static inline int sort_keys_cmp(struct btree *b, + struct bkey_packed *l, + struct bkey_packed *r) @@ -9373,10 +10172,10 @@ index 000000000000..537ab7919e88 +} diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h new file mode 100644 -index 000000000000..1059996dac78 +index 000000000000..79cf11d1b4e7 --- /dev/null +++ b/fs/bcachefs/bkey_sort.h -@@ -0,0 +1,49 @@ +@@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BKEY_SORT_H +#define _BCACHEFS_BKEY_SORT_H @@ -9416,11 +10215,6 @@ index 000000000000..1059996dac78 +bch2_sort_repack(struct bset *, struct btree *, + struct 
btree_node_iter *, + struct bkey_format *, bool); -+struct btree_nr_keys -+bch2_sort_repack_merge(struct bch_fs *, -+ struct bset *, struct btree *, -+ struct btree_node_iter *, -+ struct bkey_format *, bool); + +unsigned bch2_sort_keys(struct bkey_packed *, + struct sort_iter *, bool); @@ -9428,10 +10222,10 @@ index 000000000000..1059996dac78 +#endif /* _BCACHEFS_BKEY_SORT_H */ diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c new file mode 100644 -index 000000000000..59e4c1d1a2a5 +index 000000000000..c7a41d0dc781 --- /dev/null +++ b/fs/bcachefs/bset.c -@@ -0,0 +1,1712 @@ +@@ -0,0 +1,1598 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Code for working with individual keys, and sorted sets of keys with in a @@ -9504,7 +10298,7 @@ index 000000000000..59e4c1d1a2a5 + struct bkey_packed *_k, *_n; + struct bkey uk, n; + struct bkey_s_c k; -+ char buf[200]; ++ struct printbuf buf = PRINTBUF; + + if (!i->u64s) + return; @@ -9515,12 +10309,14 @@ index 000000000000..59e4c1d1a2a5 + _n = bkey_next(_k); + + k = bkey_disassemble(b, _k, &uk); ++ ++ printbuf_reset(&buf); + if (c) -+ bch2_bkey_val_to_text(&PBUF(buf), c, k); ++ bch2_bkey_val_to_text(&buf, c, k); + else -+ bch2_bkey_to_text(&PBUF(buf), k.k); ++ bch2_bkey_to_text(&buf, k.k); + printk(KERN_ERR "block %u key %5zu: %s\n", set, -+ _k->_data - i->_data, buf); ++ _k->_data - i->_data, buf.buf); + + if (_n == vstruct_last(i)) + continue; @@ -9536,6 +10332,8 @@ index 000000000000..59e4c1d1a2a5 + !bpos_cmp(n.p, k.k->p)) + printk(KERN_ERR "Duplicate keys\n"); + } ++ ++ printbuf_exit(&buf); +} + +void bch2_dump_btree_node(struct bch_fs *c, struct btree *b) @@ -9552,6 +10350,7 @@ index 000000000000..59e4c1d1a2a5 + struct btree_node_iter *iter) +{ + struct btree_node_iter_set *set; ++ struct printbuf buf = PRINTBUF; + + printk(KERN_ERR "btree node iter with %u/%u sets:\n", + __btree_node_iter_used(iter), b->nsets); @@ -9560,12 +10359,14 @@ index 000000000000..59e4c1d1a2a5 + struct bkey_packed *k = 
__btree_node_offset_to_key(b, set->k); + struct bset_tree *t = bch2_bkey_to_bset(b, k); + struct bkey uk = bkey_unpack_key(b, k); -+ char buf[100]; + -+ bch2_bkey_to_text(&PBUF(buf), &uk); ++ printbuf_reset(&buf); ++ bch2_bkey_to_text(&buf, &uk); + printk(KERN_ERR "set %zu key %u: %s\n", -+ t - b->set, set->k, buf); ++ t - b->set, set->k, buf.buf); + } ++ ++ printbuf_exit(&buf); +} + +#ifdef CONFIG_BCACHEFS_DEBUG @@ -9601,13 +10402,14 @@ index 000000000000..59e4c1d1a2a5 + struct btree_node_iter_set *set; + struct bkey ku = bkey_unpack_key(b, k); + struct bkey nu = bkey_unpack_key(b, n); -+ char buf1[80], buf2[80]; ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; + + bch2_dump_btree_node(NULL, b); -+ bch2_bkey_to_text(&PBUF(buf1), &ku); -+ bch2_bkey_to_text(&PBUF(buf2), &nu); ++ bch2_bkey_to_text(&buf1, &ku); ++ bch2_bkey_to_text(&buf2, &nu); + printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n", -+ buf1, buf2); ++ buf1.buf, buf2.buf); + printk(KERN_ERR "iter was:"); + + btree_node_iter_for_each(_iter, set) { @@ -9672,6 +10474,8 @@ index 000000000000..59e4c1d1a2a5 + struct bset_tree *t = bch2_bkey_to_bset(b, where); + struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where); + struct bkey_packed *next = (void *) (where->_data + clobber_u64s); ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; +#if 0 + BUG_ON(prev && + bkey_iter_cmp(b, prev, insert) > 0); @@ -9680,17 +10484,15 @@ index 000000000000..59e4c1d1a2a5 + bkey_iter_cmp(b, prev, insert) > 0) { + struct bkey k1 = bkey_unpack_key(b, prev); + struct bkey k2 = bkey_unpack_key(b, insert); -+ char buf1[100]; -+ char buf2[100]; + + bch2_dump_btree_node(NULL, b); -+ bch2_bkey_to_text(&PBUF(buf1), &k1); -+ bch2_bkey_to_text(&PBUF(buf2), &k2); ++ bch2_bkey_to_text(&buf1, &k1); ++ bch2_bkey_to_text(&buf2, &k2); + + panic("prev > insert:\n" + "prev key %s\n" + "insert key %s\n", -+ buf1, buf2); ++ buf1.buf, buf2.buf); + } +#endif +#if 0 @@ -9701,17 +10503,15 @@ index 
000000000000..59e4c1d1a2a5 + bkey_iter_cmp(b, insert, next) > 0) { + struct bkey k1 = bkey_unpack_key(b, insert); + struct bkey k2 = bkey_unpack_key(b, next); -+ char buf1[100]; -+ char buf2[100]; + + bch2_dump_btree_node(NULL, b); -+ bch2_bkey_to_text(&PBUF(buf1), &k1); -+ bch2_bkey_to_text(&PBUF(buf2), &k2); ++ bch2_bkey_to_text(&buf1, &k1); ++ bch2_bkey_to_text(&buf2, &k2); + + panic("insert > next:\n" + "insert key %s\n" + "next key %s\n", -+ buf1, buf2); ++ buf1.buf, buf2.buf); + } +#endif +} @@ -9907,7 +10707,7 @@ index 000000000000..59e4c1d1a2a5 + unsigned j) +{ + return cacheline_to_bkey(b, t, -+ __eytzinger1_to_inorder(j, t->size, t->extra), ++ __eytzinger1_to_inorder(j, t->size - 1, t->extra), + bkey_float(b, t, j)->key_offset); +} + @@ -10041,10 +10841,10 @@ index 000000000000..59e4c1d1a2a5 +} + +__always_inline -+static inline void __make_bfloat(struct btree *b, struct bset_tree *t, -+ unsigned j, -+ struct bkey_packed *min_key, -+ struct bkey_packed *max_key) ++static inline void make_bfloat(struct btree *b, struct bset_tree *t, ++ unsigned j, ++ struct bkey_packed *min_key, ++ struct bkey_packed *max_key) +{ + struct bkey_float *f = bkey_float(b, t, j); + struct bkey_packed *m = tree_to_bkey(b, t, j); @@ -10113,34 +10913,6 @@ index 000000000000..59e4c1d1a2a5 + f->mantissa = mantissa; +} + -+static void make_bfloat(struct btree *b, struct bset_tree *t, -+ unsigned j, -+ struct bkey_packed *min_key, -+ struct bkey_packed *max_key) -+{ -+ struct bkey_i *k; -+ -+ if (is_power_of_2(j) && -+ !min_key->u64s) { -+ if (!bkey_pack_pos(min_key, b->data->min_key, b)) { -+ k = (void *) min_key; -+ bkey_init(&k->k); -+ k->k.p = b->data->min_key; -+ } -+ } -+ -+ if (is_power_of_2(j + 1) && -+ !max_key->u64s) { -+ if (!bkey_pack_pos(max_key, b->data->max_key, b)) { -+ k = (void *) max_key; -+ bkey_init(&k->k); -+ k->k.p = b->data->max_key; -+ } -+ } -+ -+ __make_bfloat(b, t, j, min_key, max_key); -+} -+ +/* bytes remaining - only valid for last bset: */ +static 
unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t) +{ @@ -10197,7 +10969,7 @@ index 000000000000..59e4c1d1a2a5 + t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1; + + /* First we figure out where the first key in each cacheline is */ -+ eytzinger1_for_each(j, t->size) { ++ eytzinger1_for_each(j, t->size - 1) { + while (bkey_to_cacheline(b, t, k) < cacheline) + prev = k, k = bkey_next(k); + @@ -10229,10 +11001,10 @@ index 000000000000..59e4c1d1a2a5 + } + + /* Then we build the tree */ -+ eytzinger1_for_each(j, t->size) -+ __make_bfloat(b, t, j, -+ bkey_to_packed(&min_key), -+ bkey_to_packed(&max_key)); ++ eytzinger1_for_each(j, t->size - 1) ++ make_bfloat(b, t, j, ++ bkey_to_packed(&min_key), ++ bkey_to_packed(&max_key)); +} + +static void bset_alloc_tree(struct btree *b, struct bset_tree *t) @@ -10331,7 +11103,7 @@ index 000000000000..59e4c1d1a2a5 + do { + p = j ? tree_to_bkey(b, t, + __inorder_to_eytzinger1(j--, -+ t->size, t->extra)) ++ t->size - 1, t->extra)) + : btree_bkey_first(b, t); + } while (p >= k); + break; @@ -10377,91 +11149,6 @@ index 000000000000..59e4c1d1a2a5 + +/* Insert */ + -+static void rw_aux_tree_fix_invalidated_key(struct btree *b, -+ struct bset_tree *t, -+ struct bkey_packed *k) -+{ -+ unsigned offset = __btree_node_key_to_offset(b, k); -+ unsigned j = rw_aux_tree_bsearch(b, t, offset); -+ -+ if (j < t->size && -+ rw_aux_tree(b, t)[j].offset == offset) -+ rw_aux_tree_set(b, t, j, k); -+ -+ bch2_bset_verify_rw_aux_tree(b, t); -+} -+ -+static void ro_aux_tree_fix_invalidated_key(struct btree *b, -+ struct bset_tree *t, -+ struct bkey_packed *k) -+{ -+ struct bkey_packed min_key, max_key; -+ unsigned inorder, j; -+ -+ EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); -+ -+ /* signal to make_bfloat() that they're uninitialized: */ -+ min_key.u64s = max_key.u64s = 0; -+ -+ if (bkey_next(k) == btree_bkey_last(b, t)) { -+ for (j = 1; j < t->size; j = j * 2 + 1) -+ make_bfloat(b, t, j, &min_key, 
&max_key); -+ } -+ -+ inorder = bkey_to_cacheline(b, t, k); -+ -+ if (inorder && -+ inorder < t->size) { -+ j = __inorder_to_eytzinger1(inorder, t->size, t->extra); -+ -+ if (k == tree_to_bkey(b, t, j)) { -+ /* Fix the node this key corresponds to */ -+ make_bfloat(b, t, j, &min_key, &max_key); -+ -+ /* Children for which this key is the right boundary */ -+ for (j = eytzinger1_left_child(j); -+ j < t->size; -+ j = eytzinger1_right_child(j)) -+ make_bfloat(b, t, j, &min_key, &max_key); -+ } -+ } -+ -+ if (inorder + 1 < t->size) { -+ j = __inorder_to_eytzinger1(inorder + 1, t->size, t->extra); -+ -+ if (k == tree_to_prev_bkey(b, t, j)) { -+ make_bfloat(b, t, j, &min_key, &max_key); -+ -+ /* Children for which this key is the left boundary */ -+ for (j = eytzinger1_right_child(j); -+ j < t->size; -+ j = eytzinger1_left_child(j)) -+ make_bfloat(b, t, j, &min_key, &max_key); -+ } -+ } -+} -+ -+/** -+ * bch2_bset_fix_invalidated_key() - given an existing key @k that has been -+ * modified, fix any auxiliary search tree by remaking all the nodes in the -+ * auxiliary search tree that @k corresponds to -+ */ -+void bch2_bset_fix_invalidated_key(struct btree *b, struct bkey_packed *k) -+{ -+ struct bset_tree *t = bch2_bkey_to_bset(b, k); -+ -+ switch (bset_aux_tree_type(t)) { -+ case BSET_NO_AUX_TREE: -+ break; -+ case BSET_RO_AUX_TREE: -+ ro_aux_tree_fix_invalidated_key(b, t, k); -+ break; -+ case BSET_RW_AUX_TREE: -+ rw_aux_tree_fix_invalidated_key(b, t, k); -+ break; -+ } -+} -+ +static void bch2_bset_fix_lookup_table(struct btree *b, + struct bset_tree *t, + struct bkey_packed *_where, @@ -10696,7 +11383,7 @@ index 000000000000..59e4c1d1a2a5 + n = n * 2 + (cmp < 0); + } while (n < t->size); + -+ inorder = __eytzinger1_to_inorder(n >> 1, t->size, t->extra); ++ inorder = __eytzinger1_to_inorder(n >> 1, t->size - 1, t->extra); + + /* + * n would have been the node we recursed to - the low bit tells us if @@ -10707,7 +11394,7 @@ index 000000000000..59e4c1d1a2a5 + if 
(unlikely(!inorder)) + return btree_bkey_first(b, t); + -+ f = &base->f[eytzinger1_prev(n >> 1, t->size)]; ++ f = &base->f[eytzinger1_prev(n >> 1, t->size - 1)]; + } + + return cacheline_to_bkey(b, t, inorder, f->key_offset); @@ -10981,10 +11668,6 @@ index 000000000000..59e4c1d1a2a5 + + EBUG_ON(iter->data->k > iter->data->end); + -+ while (!__btree_node_iter_set_end(iter, 0) && -+ !__bch2_btree_node_iter_peek_all(iter, b)->u64s) -+ iter->data->k++; -+ + if (unlikely(__btree_node_iter_set_end(iter, 0))) { + bch2_btree_node_iter_set_drop(iter, iter->data); + return; @@ -11118,9 +11801,6 @@ index 000000000000..59e4c1d1a2a5 + struct bkey uk; + unsigned j, inorder; + -+ if (out->pos != out->end) -+ *out->pos = '\0'; -+ + if (!bset_has_ro_aux_tree(t)) + return; + @@ -11128,7 +11808,7 @@ index 000000000000..59e4c1d1a2a5 + if (!inorder || inorder >= t->size) + return; + -+ j = __inorder_to_eytzinger1(inorder, t->size, t->extra); ++ j = __inorder_to_eytzinger1(inorder, t->size - 1, t->extra); + if (k != tree_to_bkey(b, t, j)) + return; + @@ -11146,10 +11826,10 @@ index 000000000000..59e4c1d1a2a5 +} diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h new file mode 100644 -index 000000000000..e42f866cf2ec +index 000000000000..0d46534c3dcd --- /dev/null +++ b/fs/bcachefs/bset.h -@@ -0,0 +1,616 @@ +@@ -0,0 +1,615 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BSET_H +#define _BCACHEFS_BSET_H @@ -11513,7 +12193,6 @@ index 000000000000..e42f866cf2ec +void bch2_bset_init_next(struct bch_fs *, struct btree *, + struct btree_node_entry *); +void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool); -+void bch2_bset_fix_invalidated_key(struct btree *, struct bkey_packed *); + +void bch2_bset_insert(struct btree *, struct btree_node_iter *, + struct bkey_packed *, struct bkey_i *, unsigned); @@ -11768,10 +12447,10 @@ index 000000000000..e42f866cf2ec +#endif /* _BCACHEFS_BSET_H */ diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c new file 
mode 100644 -index 000000000000..5ae61e5d3923 +index 000000000000..0dcdc30c6888 --- /dev/null +++ b/fs/bcachefs/btree_cache.c -@@ -0,0 +1,1095 @@ +@@ -0,0 +1,1160 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -11789,6 +12468,13 @@ index 000000000000..5ae61e5d3923 + +struct lock_class_key bch2_btree_node_lock_key; + ++const char * const bch2_btree_node_flags[] = { ++#define x(f) #f, ++ BTREE_FLAGS() ++#undef x ++ NULL ++}; ++ +void bch2_recalc_btree_reserve(struct bch_fs *c) +{ + unsigned i, reserve = 16; @@ -11809,6 +12495,14 @@ index 000000000000..5ae61e5d3923 + return max_t(int, 0, bc->used - bc->reserve); +} + ++static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b) ++{ ++ if (b->c.lock.readers) ++ list_move(&b->list, &bc->freed_pcpu); ++ else ++ list_move(&b->list, &bc->freed_nonpcpu); ++} ++ +static void btree_node_data_free(struct bch_fs *c, struct btree *b) +{ + struct btree_cache *bc = &c->btree_cache; @@ -11825,7 +12519,8 @@ index 000000000000..5ae61e5d3923 + b->aux_data = NULL; + + bc->used--; -+ list_move(&b->list, &bc->freed); ++ ++ btree_node_to_freedlist(bc, b); +} + +static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, @@ -11857,6 +12552,8 @@ index 000000000000..5ae61e5d3923 + b->aux_data = mmap(NULL, btree_aux_data_bytes(b), + PROT_READ|PROT_WRITE|PROT_EXEC, + MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); ++ if (b->aux_data == MAP_FAILED) ++ b->aux_data = NULL; +#endif + if (!b->aux_data) { + kvpfree(b->data, btree_bytes(c)); @@ -11928,11 +12625,6 @@ index 000000000000..5ae61e5d3923 + b->c.level = level; + b->c.btree_id = id; + -+ if (level) -+ six_lock_pcpu_alloc(&b->c.lock); -+ else -+ six_lock_pcpu_free_rcu(&b->c.lock); -+ + mutex_lock(&bc->lock); + ret = __bch2_btree_node_hash_insert(bc, b); + if (!ret) @@ -11989,15 +12681,13 @@ index 000000000000..5ae61e5d3923 + goto wait_on_io; + } + -+ if (btree_node_noevict(b)) -+ goto out_unlock; -+ -+ if (!btree_node_may_write(b)) ++ if 
(btree_node_noevict(b) || ++ btree_node_write_blocked(b) || ++ btree_node_will_make_reachable(b)) + goto out_unlock; + + if (btree_node_dirty(b)) { -+ if (!flush || -+ test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) ++ if (!flush) + goto out_unlock; + /* + * Using the underscore version because we don't want to compact @@ -12006,9 +12696,9 @@ index 000000000000..5ae61e5d3923 + * the post write cleanup: + */ + if (bch2_verify_btree_ondisk) -+ bch2_btree_node_write(c, b, SIX_LOCK_intent); ++ bch2_btree_node_write(c, b, SIX_LOCK_intent, 0); + else -+ __bch2_btree_node_write(c, b, false); ++ __bch2_btree_node_write(c, b, 0); + + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); @@ -12048,6 +12738,7 @@ index 000000000000..5ae61e5d3923 + unsigned long touched = 0; + unsigned long freed = 0; + unsigned i, flags; ++ unsigned long ret = SHRINK_STOP; + + if (bch2_btree_shrinker_disabled) + return SHRINK_STOP; @@ -12056,7 +12747,7 @@ index 000000000000..5ae61e5d3923 + if (sc->gfp_mask & __GFP_FS) + mutex_lock(&bc->lock); + else if (!mutex_trylock(&bc->lock)) -+ return -1; ++ goto out_norestore; + + flags = memalloc_nofs_save(); + @@ -12073,13 +12764,19 @@ index 000000000000..5ae61e5d3923 + + i = 0; + list_for_each_entry_safe(b, t, &bc->freeable, list) { ++ /* ++ * Leave a few nodes on the freeable list, so that a btree split ++ * won't have to hit the system allocator: ++ */ ++ if (++i <= 3) ++ continue; ++ + touched++; + + if (touched >= nr) + break; + -+ if (++i > 3 && -+ !btree_node_reclaim(c, b)) { ++ if (!btree_node_reclaim(c, b)) { + btree_node_data_free(c, b); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); @@ -12088,17 +12785,13 @@ index 000000000000..5ae61e5d3923 + } +restart: + list_for_each_entry_safe(b, t, &bc->live, list) { -+ touched++; -+ -+ if (touched >= nr) { -+ /* Save position */ -+ if (&t->list != &bc->live) -+ list_move_tail(&bc->live, &t->list); -+ break; ++ /* tweak this */ ++ if (btree_node_accessed(b)) { ++ 
clear_btree_node_accessed(b); ++ goto touched; + } + -+ if (!btree_node_accessed(b) && -+ !btree_node_reclaim(c, b)) { ++ if (!btree_node_reclaim(c, b)) { + /* can't call bch2_btree_node_hash_remove under lock */ + freed++; + if (&t->list != &bc->live) @@ -12119,14 +12812,30 @@ index 000000000000..5ae61e5d3923 + else if (!mutex_trylock(&bc->lock)) + goto out; + goto restart; -+ } else -+ clear_btree_node_accessed(b); ++ } else { ++ continue; ++ } ++touched: ++ touched++; ++ ++ if (touched >= nr) { ++ /* Save position */ ++ if (&t->list != &bc->live) ++ list_move_tail(&bc->live, &t->list); ++ break; ++ } + } + + mutex_unlock(&bc->lock); +out: ++ ret = (unsigned long) freed * btree_pages(c); + memalloc_nofs_restore(flags); -+ return (unsigned long) freed * btree_pages(c); ++out_norestore: ++ trace_btree_cache_scan(sc->nr_to_scan, ++ sc->nr_to_scan / btree_pages(c), ++ btree_cache_can_free(bc), ++ ret); ++ return ret; +} + +static unsigned long bch2_btree_cache_count(struct shrinker *shrink, @@ -12174,15 +12883,17 @@ index 000000000000..5ae61e5d3923 + + if (btree_node_dirty(b)) + bch2_btree_complete_write(c, b, btree_current_write(b)); -+ clear_btree_node_dirty(c, b); ++ clear_btree_node_dirty_acct(c, b); + + btree_node_data_free(c, b); + } + + BUG_ON(atomic_read(&c->btree_cache.dirty)); + -+ while (!list_empty(&bc->freed)) { -+ b = list_first_entry(&bc->freed, struct btree, list); ++ list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu); ++ ++ while (!list_empty(&bc->freed_nonpcpu)) { ++ b = list_first_entry(&bc->freed_nonpcpu, struct btree, list); + list_del(&b->list); + six_lock_pcpu_free(&b->c.lock); + kfree(b); @@ -12236,7 +12947,8 @@ index 000000000000..5ae61e5d3923 + mutex_init(&bc->lock); + INIT_LIST_HEAD(&bc->live); + INIT_LIST_HEAD(&bc->freeable); -+ INIT_LIST_HEAD(&bc->freed); ++ INIT_LIST_HEAD(&bc->freed_pcpu); ++ INIT_LIST_HEAD(&bc->freed_nonpcpu); +} + +/* @@ -12311,10 +13023,13 @@ index 000000000000..5ae61e5d3923 + } +} + -+struct btree 
*bch2_btree_node_mem_alloc(struct bch_fs *c) ++struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c, bool pcpu_read_locks) +{ + struct btree_cache *bc = &c->btree_cache; -+ struct btree *b; ++ struct list_head *freed = pcpu_read_locks ++ ? &bc->freed_pcpu ++ : &bc->freed_nonpcpu; ++ struct btree *b, *b2; + u64 start_time = local_clock(); + unsigned flags; + @@ -12322,44 +13037,49 @@ index 000000000000..5ae61e5d3923 + mutex_lock(&bc->lock); + + /* -+ * btree_free() doesn't free memory; it sticks the node on the end of -+ * the list. Check if there's any freed nodes there: -+ */ -+ list_for_each_entry(b, &bc->freeable, list) -+ if (!btree_node_reclaim(c, b)) -+ goto got_node; -+ -+ /* + * We never free struct btree itself, just the memory that holds the on + * disk node. Check the freed list before allocating a new one: + */ -+ list_for_each_entry(b, &bc->freed, list) -+ if (!btree_node_reclaim(c, b)) ++ list_for_each_entry(b, freed, list) ++ if (!btree_node_reclaim(c, b)) { ++ list_del_init(&b->list); + goto got_node; ++ } + -+ b = NULL; ++ b = __btree_node_mem_alloc(c); ++ if (!b) ++ goto err_locked; ++ ++ if (pcpu_read_locks) ++ six_lock_pcpu_alloc(&b->c.lock); ++ ++ BUG_ON(!six_trylock_intent(&b->c.lock)); ++ BUG_ON(!six_trylock_write(&b->c.lock)); +got_node: -+ if (b) -+ list_del_init(&b->list); ++ ++ /* ++ * btree_free() doesn't free memory; it sticks the node on the end of ++ * the list. 
Check if there's any freed nodes there: ++ */ ++ list_for_each_entry(b2, &bc->freeable, list) ++ if (!btree_node_reclaim(c, b2)) { ++ swap(b->data, b2->data); ++ swap(b->aux_data, b2->aux_data); ++ btree_node_to_freedlist(bc, b2); ++ six_unlock_write(&b2->c.lock); ++ six_unlock_intent(&b2->c.lock); ++ goto got_mem; ++ } ++ + mutex_unlock(&bc->lock); + -+ if (!b) { -+ b = __btree_node_mem_alloc(c); -+ if (!b) -+ goto err; ++ if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL)) ++ goto err; + -+ BUG_ON(!six_trylock_intent(&b->c.lock)); -+ BUG_ON(!six_trylock_write(&b->c.lock)); -+ } -+ -+ if (!b->data) { -+ if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL)) -+ goto err; -+ -+ mutex_lock(&bc->lock); -+ bc->used++; -+ mutex_unlock(&bc->lock); -+ } ++ mutex_lock(&bc->lock); ++ bc->used++; ++got_mem: ++ mutex_unlock(&bc->lock); + + BUG_ON(btree_node_hashed(b)); + BUG_ON(btree_node_dirty(b)); @@ -12381,20 +13101,24 @@ index 000000000000..5ae61e5d3923 + return b; +err: + mutex_lock(&bc->lock); -+ -+ if (b) { -+ list_add(&b->list, &bc->freed); -+ six_unlock_write(&b->c.lock); -+ six_unlock_intent(&b->c.lock); -+ } -+ ++err_locked: + /* Try to cannibalize another cached btree node: */ + if (bc->alloc_lock == current) { -+ b = btree_node_cannibalize(c); -+ list_del_init(&b->list); -+ mutex_unlock(&bc->lock); ++ b2 = btree_node_cannibalize(c); ++ bch2_btree_node_hash_remove(bc, b2); + -+ bch2_btree_node_hash_remove(bc, b); ++ if (b) { ++ swap(b->data, b2->data); ++ swap(b->aux_data, b2->aux_data); ++ btree_node_to_freedlist(bc, b2); ++ six_unlock_write(&b2->c.lock); ++ six_unlock_intent(&b2->c.lock); ++ } else { ++ b = b2; ++ list_del_init(&b->list); ++ } ++ ++ mutex_unlock(&bc->lock); + + trace_btree_node_cannibalize(c); + goto out; @@ -12425,11 +13149,22 @@ index 000000000000..5ae61e5d3923 + * been freed: + */ + if (trans && !bch2_btree_node_relock(trans, path, level + 1)) { ++ trace_trans_restart_relock_parent_for_fill(trans->fn, ++ _THIS_IP_, btree_id, 
&path->pos); ++ btree_trans_restart(trans); ++ return ERR_PTR(-EINTR); ++ } ++ ++ b = bch2_btree_node_mem_alloc(c, level != 0); ++ ++ if (trans && b == ERR_PTR(-ENOMEM)) { ++ trans->memory_allocation_failure = true; ++ trace_trans_restart_memory_allocation_failure(trans->fn, ++ _THIS_IP_, btree_id, &path->pos); + btree_trans_restart(trans); + return ERR_PTR(-EINTR); + } + -+ b = bch2_btree_node_mem_alloc(c); + if (IS_ERR(b)) + return b; + @@ -12472,6 +13207,8 @@ index 000000000000..5ae61e5d3923 + } + + if (!six_relock_type(&b->c.lock, lock_type, seq)) { ++ trace_trans_restart_relock_after_fill(trans->fn, _THIS_IP_, ++ btree_id, &path->pos); + btree_trans_restart(trans); + return ERR_PTR(-EINTR); + } @@ -12489,14 +13226,16 @@ index 000000000000..5ae61e5d3923 + +static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) +{ -+ char buf1[200], buf2[100], buf3[100]; ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; ++ struct printbuf buf3 = PRINTBUF; + + if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) + return; + -+ bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&b->key)); -+ bch2_bpos_to_text(&PBUF(buf2), b->data->min_key); -+ bch2_bpos_to_text(&PBUF(buf3), b->data->max_key); ++ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&b->key)); ++ bch2_bpos_to_text(&buf2, b->data->min_key); ++ bch2_bpos_to_text(&buf3, b->data->max_key); + + bch2_fs_inconsistent(c, "btree node header doesn't match ptr\n" + "btree %s level %u\n" @@ -12504,10 +13243,14 @@ index 000000000000..5ae61e5d3923 + "header: btree %s level %llu\n" + "min %s max %s\n", + bch2_btree_ids[b->c.btree_id], b->c.level, -+ buf1, ++ buf1.buf, + bch2_btree_ids[BTREE_NODE_ID(b->data)], + BTREE_NODE_LEVEL(b->data), -+ buf2, buf3); ++ buf2.buf, buf3.buf); ++ ++ printbuf_exit(&buf3); ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf1); +} + +static inline void btree_check_header(struct bch_fs *c, struct btree *b) @@ -12542,16 +13285,17 @@ index 000000000000..5ae61e5d3923 + + 
EBUG_ON(level >= BTREE_MAX_DEPTH); + -+ if (c->opts.btree_node_mem_ptr_optimization) { -+ b = btree_node_mem_ptr(k); -+ /* -+ * Check b->hash_val _before_ calling btree_node_lock() - this -+ * might not be the node we want anymore, and trying to lock the -+ * wrong node could cause an unneccessary transaction restart: -+ */ -+ if (b && b->hash_val == btree_ptr_hash_val(k)) ++ b = btree_node_mem_ptr(k); ++ ++ /* ++ * Check b->hash_val _before_ calling btree_node_lock() - this might not ++ * be the node we want anymore, and trying to lock the wrong node could ++ * cause an unneccessary transaction restart: ++ */ ++ if (likely(c->opts.btree_node_mem_ptr_optimization && ++ b && ++ b->hash_val == btree_ptr_hash_val(k))) + goto lock_node; -+ } +retry: + b = btree_cache_find(bc, k); + if (unlikely(!b)) { @@ -12616,7 +13360,7 @@ index 000000000000..5ae61e5d3923 + if (bch2_btree_node_relock(trans, path, level + 1)) + goto retry; + -+ trace_trans_restart_btree_node_reused(trans->ip, ++ trace_trans_restart_btree_node_reused(trans->fn, + trace_ip, + path->btree_id, + &path->pos); @@ -12798,7 +13542,7 @@ index 000000000000..5ae61e5d3923 + six_lock_write(&b->c.lock, NULL, NULL); + + if (btree_node_dirty(b)) { -+ __bch2_btree_node_write(c, b, false); ++ __bch2_btree_node_write(c, b, 0); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + goto wait_on_io; @@ -12869,10 +13613,10 @@ index 000000000000..5ae61e5d3923 +} diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h new file mode 100644 -index 000000000000..402cec1802bc +index 000000000000..25906127c023 --- /dev/null +++ b/fs/bcachefs/btree_cache.h -@@ -0,0 +1,105 @@ +@@ -0,0 +1,107 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_CACHE_H +#define _BCACHEFS_BTREE_CACHE_H @@ -12882,6 +13626,8 @@ index 000000000000..402cec1802bc + +extern struct lock_class_key bch2_btree_node_lock_key; + ++extern const char * const bch2_btree_node_flags[]; ++ +struct btree_iter; + +void 
bch2_recalc_btree_reserve(struct bch_fs *); @@ -12895,7 +13641,7 @@ index 000000000000..402cec1802bc +int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *); + +struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *); -+struct btree *bch2_btree_node_mem_alloc(struct bch_fs *); ++struct btree *bch2_btree_node_mem_alloc(struct bch_fs *, bool); + +struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *, + const struct bkey_i *, unsigned, @@ -12946,7 +13692,7 @@ index 000000000000..402cec1802bc + +static inline size_t btree_bytes(struct bch_fs *c) +{ -+ return c->opts.btree_node_size << 9; ++ return c->opts.btree_node_size; +} + +static inline size_t btree_max_u64s(struct bch_fs *c) @@ -12961,7 +13707,7 @@ index 000000000000..402cec1802bc + +static inline unsigned btree_blocks(struct bch_fs *c) +{ -+ return c->opts.btree_node_size >> c->block_bits; ++ return btree_sectors(c) >> c->block_bits; +} + +#define BTREE_SPLIT_THRESHOLD(c) (btree_max_u64s(c) * 2 / 3) @@ -12980,10 +13726,10 @@ index 000000000000..402cec1802bc +#endif /* _BCACHEFS_BTREE_CACHE_H */ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c new file mode 100644 -index 000000000000..091bddee575d +index 000000000000..e19991796c82 --- /dev/null +++ b/fs/bcachefs/btree_gc.c -@@ -0,0 +1,1952 @@ +@@ -0,0 +1,2102 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2010 Kent Overstreet @@ -12995,6 +13741,7 @@ index 000000000000..091bddee575d +#include "alloc_foreground.h" +#include "bkey_methods.h" +#include "bkey_buf.h" ++#include "btree_key_cache.h" +#include "btree_locking.h" +#include "btree_update_interior.h" +#include "btree_io.h" @@ -13055,23 +13802,23 @@ index 000000000000..091bddee575d + struct bpos expected_start = bkey_deleted(&prev->k->k) + ? 
node_start + : bpos_successor(prev->k->k.p); -+ char buf1[200], buf2[200]; ++ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + int ret = 0; + + if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) { + struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k); + -+ if (bkey_deleted(&prev->k->k)) { -+ struct printbuf out = PBUF(buf1); -+ pr_buf(&out, "start of node: "); -+ bch2_bpos_to_text(&out, node_start); -+ } else { -+ bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev->k)); -+ } -+ + if (bpos_cmp(expected_start, bp->v.min_key)) { + bch2_topology_error(c); + ++ if (bkey_deleted(&prev->k->k)) { ++ pr_buf(&buf1, "start of node: "); ++ bch2_bpos_to_text(&buf1, node_start); ++ } else { ++ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(prev->k)); ++ } ++ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(cur.k)); ++ + if (__fsck_err(c, + FSCK_CAN_FIX| + FSCK_CAN_IGNORE| @@ -13080,11 +13827,11 @@ index 000000000000..091bddee575d + " prev %s\n" + " cur %s", + bch2_btree_ids[b->c.btree_id], b->c.level, -+ buf1, -+ (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2)) && ++ buf1.buf, buf2.buf) && + !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { + bch_info(c, "Halting mark and sweep to start topology repair pass"); -+ return FSCK_ERR_START_TOPOLOGY_REPAIR; ++ ret = FSCK_ERR_START_TOPOLOGY_REPAIR; ++ goto err; + } else { + set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); + } @@ -13094,6 +13841,12 @@ index 000000000000..091bddee575d + if (is_last && bpos_cmp(cur.k->k.p, node_end)) { + bch2_topology_error(c); + ++ printbuf_reset(&buf1); ++ printbuf_reset(&buf2); ++ ++ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(cur.k)); ++ bch2_bpos_to_text(&buf2, node_end); ++ + if (__fsck_err(c, + FSCK_CAN_FIX| + FSCK_CAN_IGNORE| @@ -13102,18 +13855,21 @@ index 000000000000..091bddee575d + " %s\n" + " expected %s", + bch2_btree_ids[b->c.btree_id], b->c.level, -+ (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(cur.k)), buf1), -+ (bch2_bpos_to_text(&PBUF(buf2), 
node_end), buf2)) && ++ buf1.buf, buf2.buf) && + !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { + bch_info(c, "Halting mark and sweep to start topology repair pass"); -+ return FSCK_ERR_START_TOPOLOGY_REPAIR; ++ ret = FSCK_ERR_START_TOPOLOGY_REPAIR; ++ goto err; + } else { + set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); + } + } + + bch2_bkey_buf_copy(prev, c, cur.k); ++err: +fsck_err: ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf1); + return ret; +} + @@ -13141,6 +13897,34 @@ index 000000000000..091bddee575d + } +} + ++static void bch2_btree_node_update_key_early(struct bch_fs *c, ++ enum btree_id btree, unsigned level, ++ struct bkey_s_c old, struct bkey_i *new) ++{ ++ struct btree *b; ++ struct bkey_buf tmp; ++ int ret; ++ ++ bch2_bkey_buf_init(&tmp); ++ bch2_bkey_buf_reassemble(&tmp, c, old); ++ ++ b = bch2_btree_node_get_noiter(c, tmp.k, btree, level, true); ++ if (!IS_ERR_OR_NULL(b)) { ++ mutex_lock(&c->btree_cache.lock); ++ ++ bch2_btree_node_hash_remove(&c->btree_cache, b); ++ ++ bkey_copy(&b->key, new); ++ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); ++ BUG_ON(ret); ++ ++ mutex_unlock(&c->btree_cache.lock); ++ six_unlock_read(&b->c.lock); ++ } ++ ++ bch2_bkey_buf_exit(&tmp, c); ++} ++ +static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min) +{ + struct bkey_i_btree_ptr_v2 *new; @@ -13155,7 +13939,7 @@ index 000000000000..091bddee575d + new->v.min_key = new_min; + SET_BTREE_PTR_RANGE_UPDATED(&new->v, true); + -+ ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level + 1, &new->k_i); ++ ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i); + if (ret) { + kfree(new); + return ret; @@ -13184,7 +13968,7 @@ index 000000000000..091bddee575d + new->k.p = new_max; + SET_BTREE_PTR_RANGE_UPDATED(&new->v, true); + -+ ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level + 1, &new->k_i); ++ ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i); + if (ret) { + 
kfree(new); + return ret; @@ -13208,18 +13992,17 @@ index 000000000000..091bddee575d + struct bpos expected_start = !prev + ? b->data->min_key + : bpos_successor(prev->key.k.p); -+ char buf1[200], buf2[200]; ++ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + int ret = 0; + + if (!prev) { -+ struct printbuf out = PBUF(buf1); -+ pr_buf(&out, "start of node: "); -+ bch2_bpos_to_text(&out, b->data->min_key); ++ pr_buf(&buf1, "start of node: "); ++ bch2_bpos_to_text(&buf1, b->data->min_key); + } else { -+ bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&prev->key)); ++ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&prev->key)); + } + -+ bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&cur->key)); ++ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&cur->key)); + + if (prev && + bpos_cmp(expected_start, cur->data->min_key) > 0 && @@ -13232,8 +14015,10 @@ index 000000000000..091bddee575d + " node %s\n" + " next %s", + bch2_btree_ids[b->c.btree_id], b->c.level, -+ buf1, buf2)) -+ return DROP_PREV_NODE; ++ buf1.buf, buf2.buf)) { ++ ret = DROP_PREV_NODE; ++ goto out; ++ } + + if (mustfix_fsck_err_on(bpos_cmp(prev->key.k.p, + bpos_predecessor(cur->data->min_key)), c, @@ -13241,7 +14026,7 @@ index 000000000000..091bddee575d + " node %s\n" + " next %s", + bch2_btree_ids[b->c.btree_id], b->c.level, -+ buf1, buf2)) ++ buf1.buf, buf2.buf)) + ret = set_node_max(c, prev, + bpos_predecessor(cur->data->min_key)); + } else { @@ -13253,39 +14038,49 @@ index 000000000000..091bddee575d + " prev %s\n" + " node %s", + bch2_btree_ids[b->c.btree_id], b->c.level, -+ buf1, buf2)) -+ return DROP_THIS_NODE; ++ buf1.buf, buf2.buf)) { ++ ret = DROP_THIS_NODE; ++ goto out; ++ } + + if (mustfix_fsck_err_on(bpos_cmp(expected_start, cur->data->min_key), c, + "btree node with incorrect min_key at btree %s level %u:\n" + " prev %s\n" + " node %s", + bch2_btree_ids[b->c.btree_id], b->c.level, -+ buf1, buf2)) ++ buf1.buf, buf2.buf)) + ret = set_node_min(c, cur, expected_start); + } ++out: 
+fsck_err: ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf1); + return ret; +} + +static int btree_repair_node_end(struct bch_fs *c, struct btree *b, + struct btree *child) +{ -+ char buf1[200], buf2[200]; ++ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + int ret = 0; + ++ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&child->key)); ++ bch2_bpos_to_text(&buf2, b->key.k.p); ++ + if (mustfix_fsck_err_on(bpos_cmp(child->key.k.p, b->key.k.p), c, + "btree node with incorrect max_key at btree %s level %u:\n" + " %s\n" + " expected %s", + bch2_btree_ids[b->c.btree_id], b->c.level, -+ (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&child->key)), buf1), -+ (bch2_bpos_to_text(&PBUF(buf2), b->key.k.p), buf2))) { ++ buf1.buf, buf2.buf)) { + ret = set_node_max(c, child, b->key.k.p); + if (ret) -+ return ret; ++ goto err; + } ++err: +fsck_err: ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf1); + return ret; +} + @@ -13296,7 +14091,7 @@ index 000000000000..091bddee575d + struct bkey_buf prev_k, cur_k; + struct btree *prev = NULL, *cur = NULL; + bool have_child, dropped_children = false; -+ char buf[200]; ++ struct printbuf buf; + int ret = 0; + + if (!b->c.level) @@ -13320,12 +14115,15 @@ index 000000000000..091bddee575d + false); + ret = PTR_ERR_OR_ZERO(cur); + ++ printbuf_reset(&buf); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k)); ++ + if (mustfix_fsck_err_on(ret == -EIO, c, + "Unreadable btree node at btree %s level %u:\n" + " %s", + bch2_btree_ids[b->c.btree_id], + b->c.level - 1, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(cur_k.k)), buf))) { ++ buf.buf)) { + bch2_btree_node_evict(c, cur_k.k); + ret = bch2_journal_key_delete(c, b->c.btree_id, + b->c.level, cur_k.k->k.p); @@ -13425,12 +14223,14 @@ index 000000000000..091bddee575d + have_child = true; + } + ++ printbuf_reset(&buf); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); ++ + if (mustfix_fsck_err_on(!have_child, c, + "empty interior btree node at btree %s level %u\n" + " %s", 
+ bch2_btree_ids[b->c.btree_id], -+ b->c.level, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(&b->key)), buf))) ++ b->c.level, buf.buf)) + ret = DROP_THIS_NODE; +err: +fsck_err: @@ -13446,6 +14246,7 @@ index 000000000000..091bddee575d + if (!ret && dropped_children) + goto again; + ++ printbuf_exit(&buf); + return ret; +} + @@ -13481,7 +14282,7 @@ index 000000000000..091bddee575d + const union bch_extent_entry *entry; + struct extent_ptr_decoded p = { 0 }; + bool do_update = false; -+ char buf[200]; ++ struct printbuf buf = PRINTBUF; + int ret = 0; + + /* @@ -13490,8 +14291,7 @@ index 000000000000..091bddee575d + */ + bkey_for_each_ptr_decode(k->k, ptrs, p, entry) { + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); -+ struct bucket *g = PTR_BUCKET(ca, &p.ptr, true); -+ struct bucket *g2 = PTR_BUCKET(ca, &p.ptr, false); ++ struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); + enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr); + + if (fsck_err_on(!g->gen_valid, c, @@ -13500,103 +14300,94 @@ index 000000000000..091bddee575d + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[ptr_data_type(k->k, &p.ptr)], + p.ptr.gen, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { + if (!p.ptr.cached) { -+ g2->_mark.gen = g->_mark.gen = p.ptr.gen; -+ g2->gen_valid = g->gen_valid = true; -+ set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); ++ g->gen_valid = true; ++ g->gen = p.ptr.gen; + } else { + do_update = true; + } + } + -+ if (fsck_err_on(data_type == BCH_DATA_btree && -+ g->mark.gen != p.ptr.gen, c, -+ "bucket %u:%zu data type %s has metadata but wrong gen: %u != %u\n" -+ "while marking %s", -+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), -+ bch2_data_types[ptr_data_type(k->k, &p.ptr)], -+ p.ptr.gen, g->mark.gen, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { -+ g2->_mark.data_type = g->_mark.data_type = data_type; -+ g2->gen_valid = g->gen_valid = true; -+ 
set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); -+ } -+ -+ if (fsck_err_on(gen_cmp(p.ptr.gen, g->mark.gen) > 0, c, ++ if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, c, + "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[ptr_data_type(k->k, &p.ptr)], -+ p.ptr.gen, g->mark.gen, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { ++ p.ptr.gen, g->gen, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { + if (!p.ptr.cached) { -+ g2->_mark.gen = g->_mark.gen = p.ptr.gen; -+ g2->gen_valid = g->gen_valid = true; -+ g2->_mark.data_type = 0; -+ g2->_mark.dirty_sectors = 0; -+ g2->_mark.cached_sectors = 0; ++ g->gen_valid = true; ++ g->gen = p.ptr.gen; ++ g->data_type = 0; ++ g->dirty_sectors = 0; ++ g->cached_sectors = 0; + set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); -+ set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); + } else { + do_update = true; + } + } + -+ if (fsck_err_on(gen_cmp(g->mark.gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, c, ++ if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, c, + "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" + "while marking %s", -+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->mark.gen, ++ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, + bch2_data_types[ptr_data_type(k->k, &p.ptr)], + p.ptr.gen, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) + do_update = true; + + if (fsck_err_on(!p.ptr.cached && -+ gen_cmp(p.ptr.gen, g->mark.gen) < 0, c, ++ gen_cmp(p.ptr.gen, g->gen) < 0, c, + "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[ptr_data_type(k->k, &p.ptr)], -+ p.ptr.gen, g->mark.gen, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) ++ p.ptr.gen, g->gen, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) + do_update = 
true; + -+ if (p.ptr.gen != g->mark.gen) ++ if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen) + continue; + -+ if (fsck_err_on(g->mark.data_type && -+ g->mark.data_type != data_type, c, ++ if (fsck_err_on(g->data_type && ++ g->data_type != data_type, c, + "bucket %u:%zu different types of data in same bucket: %s, %s\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), -+ bch2_data_types[g->mark.data_type], ++ bch2_data_types[g->data_type], + bch2_data_types[data_type], -+ (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { + if (data_type == BCH_DATA_btree) { -+ g2->_mark.data_type = g->_mark.data_type = data_type; -+ g2->gen_valid = g->gen_valid = true; -+ set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); ++ g->data_type = data_type; ++ set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + } else { + do_update = true; + } + } + + if (p.has_ec) { -+ struct stripe *m = genradix_ptr(&c->stripes[true], p.ec.idx); ++ struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx); + + if (fsck_err_on(!m || !m->alive, c, + "pointer to nonexistent stripe %llu\n" + "while marking %s", + (u64) p.ec.idx, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) + do_update = true; + + if (fsck_err_on(!bch2_ptr_matches_stripe_m(m, p), c, + "pointer does not match stripe %llu\n" + "while marking %s", + (u64) p.ec.idx, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) + do_update = true; + } + } @@ -13609,13 +14400,15 @@ index 000000000000..091bddee575d + + if (is_root) { + bch_err(c, "cannot update btree roots yet"); -+ return -EINVAL; ++ ret = -EINVAL; ++ goto err; + } + + new = kmalloc(bkey_bytes(k->k), GFP_KERNEL); + if (!new) { + bch_err(c, "%s: error allocating new key", __func__); -+ return -ENOMEM; ++ ret = -ENOMEM; ++ goto err; + } + + 
bkey_reassemble(new, *k); @@ -13629,29 +14422,29 @@ index 000000000000..091bddee575d + ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ struct bucket *g = PTR_BUCKET(ca, ptr, true); ++ struct bucket *g = PTR_GC_BUCKET(ca, ptr); + -+ ptr->gen = g->mark.gen; ++ ptr->gen = g->gen; + } + } else { + bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({ + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ struct bucket *g = PTR_BUCKET(ca, ptr, true); ++ struct bucket *g = PTR_GC_BUCKET(ca, ptr); + enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, ptr); + + (ptr->cached && -+ (!g->gen_valid || gen_cmp(ptr->gen, g->mark.gen) > 0)) || ++ (!g->gen_valid || gen_cmp(ptr->gen, g->gen) > 0)) || + (!ptr->cached && -+ gen_cmp(ptr->gen, g->mark.gen) < 0) || -+ gen_cmp(g->mark.gen, ptr->gen) > BUCKET_GC_GEN_MAX || -+ (g->mark.data_type && -+ g->mark.data_type != data_type); ++ gen_cmp(ptr->gen, g->gen) < 0) || ++ gen_cmp(g->gen, ptr->gen) > BUCKET_GC_GEN_MAX || ++ (g->data_type && ++ g->data_type != data_type); + })); +again: + ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + bkey_extent_entry_for_each(ptrs, entry) { + if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) { -+ struct stripe *m = genradix_ptr(&c->stripes[true], ++ struct gc_stripe *m = genradix_ptr(&c->gc_stripes, + entry->stripe_ptr.idx); + union bch_extent_entry *next_ptr; + @@ -13676,13 +14469,28 @@ index 000000000000..091bddee575d + } + } + -+ ret = bch2_journal_key_insert(c, btree_id, level, new); -+ if (ret) ++ ret = bch2_journal_key_insert_take(c, btree_id, level, new); ++ if (ret) { + kfree(new); -+ else -+ *k = bkey_i_to_s_c(new); ++ goto err; ++ } ++ ++ if (level) ++ bch2_btree_node_update_key_early(c, btree_id, level - 1, *k, new); ++ ++ printbuf_reset(&buf); ++ bch2_bkey_val_to_text(&buf, c, *k); ++ bch_info(c, "updated %s", buf.buf); ++ ++ printbuf_reset(&buf); ++ bch2_bkey_val_to_text(&buf, c, 
bkey_i_to_s_c(new)); ++ bch_info(c, "new key %s", buf.buf); ++ ++ *k = bkey_i_to_s_c(new); + } ++err: +fsck_err: ++ printbuf_exit(&buf); + return ret; +} + @@ -13691,20 +14499,21 @@ index 000000000000..091bddee575d +static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, + unsigned level, bool is_root, + struct bkey_s_c *k, -+ u8 *max_stale, bool initial) ++ bool initial) +{ + struct bch_fs *c = trans->c; -+ struct bkey_ptrs_c ptrs; -+ const struct bch_extent_ptr *ptr; ++ struct bkey deleted = KEY(0, 0, 0); ++ struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; + unsigned flags = + BTREE_TRIGGER_GC| + (initial ? BTREE_TRIGGER_NOATOMIC : 0); -+ char buf[200]; + int ret = 0; + ++ deleted.p = k->k->p; ++ + if (initial) { + BUG_ON(bch2_journal_seq_verify && -+ k->k->version.lo > journal_cur_seq(&c->journal)); ++ k->k->version.lo > atomic64_read(&c->journal.seq)); + + ret = bch2_check_fix_ptrs(c, btree_id, level, is_root, k); + if (ret) @@ -13715,32 +14524,10 @@ index 000000000000..091bddee575d + k->k->version.lo, + atomic64_read(&c->key_version))) + atomic64_set(&c->key_version, k->k->version.lo); -+ -+ if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || -+ fsck_err_on(!bch2_bkey_replicas_marked(c, *k), c, -+ "superblock not marked as containing replicas\n" -+ " while marking %s", -+ (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { -+ ret = bch2_mark_bkey_replicas(c, *k); -+ if (ret) { -+ bch_err(c, "error marking bkey replicas: %i", ret); -+ goto err; -+ } -+ } + } + -+ ptrs = bch2_bkey_ptrs_c(*k); -+ bkey_for_each_ptr(ptrs, ptr) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ struct bucket *g = PTR_BUCKET(ca, ptr, true); -+ -+ if (gen_after(g->oldest_gen, ptr->gen)) -+ g->oldest_gen = ptr->gen; -+ -+ *max_stale = max(*max_stale, ptr_stale(ca, ptr)); -+ } -+ -+ ret = bch2_mark_key(trans, *k, flags); ++ ret = __bch2_trans_do(trans, NULL, NULL, 0, ++ bch2_mark_key(trans, old, *k, flags)); +fsck_err: +err: + if (ret) @@ 
-13748,8 +14535,7 @@ index 000000000000..091bddee575d + return ret; +} + -+static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, u8 *max_stale, -+ bool initial) ++static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, bool initial) +{ + struct bch_fs *c = trans->c; + struct btree_node_iter iter; @@ -13758,8 +14544,6 @@ index 000000000000..091bddee575d + struct bkey_buf prev, cur; + int ret = 0; + -+ *max_stale = 0; -+ + if (!btree_node_type_needs_gc(btree_node_type(b))) + return 0; + @@ -13770,7 +14554,7 @@ index 000000000000..091bddee575d + + while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) { + ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false, -+ &k, max_stale, initial); ++ &k, initial); + if (ret) + break; + @@ -13801,7 +14585,6 @@ index 000000000000..091bddee575d + : bch2_expensive_debug_checks ? 0 + : !btree_node_type_needs_gc(btree_id) ? 1 + : 0; -+ u8 max_stale = 0; + int ret = 0; + + gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0)); @@ -13812,21 +14595,9 @@ index 000000000000..091bddee575d + + gc_pos_set(c, gc_pos_btree_node(b)); + -+ ret = btree_gc_mark_node(trans, b, &max_stale, initial); ++ ret = btree_gc_mark_node(trans, b, initial); + if (ret) + break; -+ -+ if (!initial) { -+ if (max_stale > 64) -+ bch2_btree_node_rewrite(trans, &iter, b, -+ BTREE_INSERT_NOWAIT| -+ BTREE_INSERT_GC_LOCK_HELD); -+ else if (!bch2_btree_gc_rewrite_disabled && -+ (bch2_btree_gc_always_rewrite || max_stale > 16)) -+ bch2_btree_node_rewrite(trans, &iter, -+ b, BTREE_INSERT_NOWAIT| -+ BTREE_INSERT_GC_LOCK_HELD); -+ } + } + bch2_trans_iter_exit(trans, &iter); + @@ -13838,8 +14609,8 @@ index 000000000000..091bddee575d + if (!btree_node_fake(b)) { + struct bkey_s_c k = bkey_i_to_s_c(&b->key); + -+ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, true, -+ &k, &max_stale, initial); ++ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, ++ true, &k, initial); + } + gc_pos_set(c, 
gc_pos_btree_root(b->c.btree_id)); + mutex_unlock(&c->btree_root_lock); @@ -13854,8 +14625,7 @@ index 000000000000..091bddee575d + struct btree_and_journal_iter iter; + struct bkey_s_c k; + struct bkey_buf cur, prev; -+ u8 max_stale = 0; -+ char buf[200]; ++ struct printbuf buf = PRINTBUF; + int ret = 0; + + bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); @@ -13867,8 +14637,8 @@ index 000000000000..091bddee575d + BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0); + BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0); + -+ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false, -+ &k, &max_stale, true); ++ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, ++ false, &k, true); + if (ret) { + bch_err(c, "%s: error %i from bch2_gc_mark_key", __func__, ret); + goto fsck_err; @@ -13916,7 +14686,8 @@ index 000000000000..091bddee575d + " %s", + bch2_btree_ids[b->c.btree_id], + b->c.level - 1, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(cur.k)), buf)) && ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur.k)), buf.buf)) && + !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { + ret = FSCK_ERR_START_TOPOLOGY_REPAIR; + bch_info(c, "Halting mark and sweep to start topology repair pass"); @@ -13946,6 +14717,7 @@ index 000000000000..091bddee575d + bch2_bkey_buf_exit(&cur, c); + bch2_bkey_buf_exit(&prev, c); + bch2_btree_and_journal_iter_exit(&iter); ++ printbuf_exit(&buf); + return ret; +} + @@ -13959,8 +14731,7 @@ index 000000000000..091bddee575d + : bch2_expensive_debug_checks ? 0 + : !btree_node_type_needs_gc(btree_id) ? 
1 + : 0; -+ u8 max_stale = 0; -+ char buf[100]; ++ struct printbuf buf = PRINTBUF; + int ret = 0; + + b = c->btree_roots[btree_id].b; @@ -13969,17 +14740,19 @@ index 000000000000..091bddee575d + return 0; + + six_lock_read(&b->c.lock, NULL, NULL); ++ printbuf_reset(&buf); ++ bch2_bpos_to_text(&buf, b->data->min_key); + if (mustfix_fsck_err_on(bpos_cmp(b->data->min_key, POS_MIN), c, -+ "btree root with incorrect min_key: %s", -+ (bch2_bpos_to_text(&PBUF(buf), b->data->min_key), buf))) { ++ "btree root with incorrect min_key: %s", buf.buf)) { + bch_err(c, "repair unimplemented"); + ret = FSCK_ERR_EXIT; + goto fsck_err; + } + ++ printbuf_reset(&buf); ++ bch2_bpos_to_text(&buf, b->data->max_key); + if (mustfix_fsck_err_on(bpos_cmp(b->data->max_key, SPOS_MAX), c, -+ "btree root with incorrect max_key: %s", -+ (bch2_bpos_to_text(&PBUF(buf), b->data->max_key), buf))) { ++ "btree root with incorrect max_key: %s", buf.buf)) { + bch_err(c, "repair unimplemented"); + ret = FSCK_ERR_EXIT; + goto fsck_err; @@ -13992,13 +14765,14 @@ index 000000000000..091bddee575d + struct bkey_s_c k = bkey_i_to_s_c(&b->key); + + ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, true, -+ &k, &max_stale, true); ++ &k, true); + } +fsck_err: + six_unlock_read(&b->c.lock); + + if (ret < 0) + bch_err(c, "%s: ret %i", __func__, ret); ++ printbuf_exit(&buf); + return ret; +} + @@ -14017,6 +14791,9 @@ index 000000000000..091bddee575d + + bch2_trans_init(&trans, c, 0, 0); + ++ if (initial) ++ trans.is_initial_gc = true; ++ + for (i = 0; i < BTREE_ID_NR; i++) + ids[i] = i; + bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp); @@ -14051,23 +14828,13 @@ index 000000000000..091bddee575d + } while (start < end); +} + -+void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, -+ unsigned flags) ++static void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, ++ unsigned flags) +{ + struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; + unsigned i; + u64 b; + -+ /* -+ * This 
conditional is kind of gross, but we may be called from the -+ * device add path, before the new device has actually been added to the -+ * running filesystem: -+ */ -+ if (c) { -+ lockdep_assert_held(&c->sb_lock); -+ percpu_down_read(&c->mark_lock); -+ } -+ + for (i = 0; i < layout->nr_superblocks; i++) { + u64 offset = le64_to_cpu(layout->sb_offset[i]); + @@ -14086,9 +14853,6 @@ index 000000000000..091bddee575d + ca->mi.bucket_size, + gc_phase(GC_PHASE_SB), flags); + } -+ -+ if (c) -+ percpu_up_read(&c->mark_lock); +} + +static void bch2_mark_superblocks(struct bch_fs *c) @@ -14127,13 +14891,14 @@ index 000000000000..091bddee575d + struct bch_dev *ca; + unsigned i; + -+ genradix_free(&c->stripes[1]); ++ genradix_free(&c->reflink_gc_table); ++ genradix_free(&c->gc_stripes); + + for_each_member_device(ca, c, i) { -+ kvpfree(rcu_dereference_protected(ca->buckets[1], 1), ++ kvpfree(rcu_dereference_protected(ca->buckets_gc, 1), + sizeof(struct bucket_array) + + ca->mi.nbuckets * sizeof(struct bucket)); -+ ca->buckets[1] = NULL; ++ ca->buckets_gc = NULL; + + free_percpu(ca->usage_gc); + ca->usage_gc = NULL; @@ -14147,18 +14912,20 @@ index 000000000000..091bddee575d + bool initial, bool metadata_only) +{ + struct bch_dev *ca = NULL; ++ struct printbuf buf = PRINTBUF; + bool verify = !metadata_only && (!initial || + (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info))); + unsigned i, dev; + int ret = 0; + ++ percpu_down_write(&c->mark_lock); ++ +#define copy_field(_f, _msg, ...) \ + if (dst->_f != src->_f) { \ + if (verify) \ + fsck_err(c, _msg ": got %llu, should be %llu" \ + , ##__VA_ARGS__, dst->_f, src->_f); \ + dst->_f = src->_f; \ -+ set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ + } +#define copy_stripe_field(_f, _msg, ...) 
\ + if (dst->_f != src->_f) { \ @@ -14168,85 +14935,28 @@ index 000000000000..091bddee575d + iter.pos, ##__VA_ARGS__, \ + dst->_f, src->_f); \ + dst->_f = src->_f; \ -+ set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ -+ } -+#define copy_bucket_field(_f) \ -+ if (dst->b[b].mark._f != src->b[b].mark._f) { \ -+ if (verify) \ -+ fsck_err(c, "bucket %u:%zu gen %u data type %s has wrong " #_f \ -+ ": got %u, should be %u", dev, b, \ -+ dst->b[b].mark.gen, \ -+ bch2_data_types[dst->b[b].mark.data_type],\ -+ dst->b[b].mark._f, src->b[b].mark._f); \ -+ dst->b[b]._mark._f = src->b[b].mark._f; \ -+ set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ + } +#define copy_dev_field(_f, _msg, ...) \ + copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__) +#define copy_fs_field(_f, _msg, ...) \ + copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__) + -+ if (!metadata_only) { -+ struct genradix_iter iter = genradix_iter_init(&c->stripes[1], 0); -+ struct stripe *dst, *src; -+ -+ while ((src = genradix_iter_peek(&iter, &c->stripes[1]))) { -+ dst = genradix_ptr_alloc(&c->stripes[0], iter.pos, GFP_KERNEL); -+ -+ if (dst->alive != src->alive || -+ dst->sectors != src->sectors || -+ dst->algorithm != src->algorithm || -+ dst->nr_blocks != src->nr_blocks || -+ dst->nr_redundant != src->nr_redundant) { -+ bch_err(c, "unexpected stripe inconsistency at bch2_gc_done, confused"); -+ ret = -EINVAL; -+ goto fsck_err; -+ } -+ -+ for (i = 0; i < ARRAY_SIZE(dst->block_sectors); i++) -+ copy_stripe_field(block_sectors[i], -+ "block_sectors[%u]", i); -+ -+ dst->blocks_nonempty = 0; -+ for (i = 0; i < dst->nr_blocks; i++) -+ dst->blocks_nonempty += dst->block_sectors[i] != 0; -+ -+ genradix_iter_advance(&iter, &c->stripes[1]); -+ } -+ } -+ + for (i = 0; i < ARRAY_SIZE(c->usage); i++) + bch2_fs_usage_acc_to_base(c, i); + + for_each_member_device(ca, c, dev) { -+ struct bucket_array *dst = __bucket_array(ca, 0); -+ struct bucket_array *src = __bucket_array(ca, 1); -+ size_t b; ++ struct bch_dev_usage 
*dst = ca->usage_base; ++ struct bch_dev_usage *src = (void *) ++ bch2_acc_percpu_u64s((void *) ca->usage_gc, ++ dev_usage_u64s()); + -+ for (b = 0; b < src->nbuckets; b++) { -+ copy_bucket_field(gen); -+ copy_bucket_field(data_type); -+ copy_bucket_field(stripe); -+ copy_bucket_field(dirty_sectors); -+ copy_bucket_field(cached_sectors); ++ copy_dev_field(buckets_ec, "buckets_ec"); ++ copy_dev_field(buckets_unavailable, "buckets_unavailable"); + -+ dst->b[b].oldest_gen = src->b[b].oldest_gen; -+ } -+ -+ { -+ struct bch_dev_usage *dst = ca->usage_base; -+ struct bch_dev_usage *src = (void *) -+ bch2_acc_percpu_u64s((void *) ca->usage_gc, -+ dev_usage_u64s()); -+ -+ copy_dev_field(buckets_ec, "buckets_ec"); -+ copy_dev_field(buckets_unavailable, "buckets_unavailable"); -+ -+ for (i = 0; i < BCH_DATA_NR; i++) { -+ copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]); -+ copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]); -+ copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]); -+ } ++ for (i = 0; i < BCH_DATA_NR; i++) { ++ copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]); ++ copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]); ++ copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]); + } + }; + @@ -14273,22 +14983,21 @@ index 000000000000..091bddee575d + for (i = 0; i < c->replicas.nr; i++) { + struct bch_replicas_entry *e = + cpu_replicas_entry(&c->replicas, i); -+ char buf[80]; + + if (metadata_only && + (e->data_type == BCH_DATA_user || + e->data_type == BCH_DATA_cached)) + continue; + -+ bch2_replicas_entry_to_text(&PBUF(buf), e); ++ printbuf_reset(&buf); ++ bch2_replicas_entry_to_text(&buf, e); + -+ copy_fs_field(replicas[i], "%s", buf); ++ copy_fs_field(replicas[i], "%s", buf.buf); + } + } + +#undef copy_fs_field +#undef copy_dev_field -+#undef copy_bucket_field +#undef copy_stripe_field +#undef copy_field +fsck_err: @@ -14296,6 +15005,9 @@ index 000000000000..091bddee575d + 
percpu_ref_put(&ca->ref); + if (ret) + bch_err(c, "%s: ret %i", __func__, ret); ++ ++ percpu_up_write(&c->mark_lock); ++ printbuf_exit(&buf); + return ret; +} + @@ -14304,7 +15016,6 @@ index 000000000000..091bddee575d +{ + struct bch_dev *ca = NULL; + unsigned i; -+ int ret; + + BUG_ON(c->usage_gc); + @@ -14316,18 +15027,9 @@ index 000000000000..091bddee575d + } + + for_each_member_device(ca, c, i) { -+ BUG_ON(ca->buckets[1]); ++ BUG_ON(ca->buckets_gc); + BUG_ON(ca->usage_gc); + -+ ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) + -+ ca->mi.nbuckets * sizeof(struct bucket), -+ GFP_KERNEL|__GFP_ZERO); -+ if (!ca->buckets[1]) { -+ percpu_ref_put(&ca->ref); -+ bch_err(c, "error allocating ca->buckets[gc]"); -+ return -ENOMEM; -+ } -+ + ca->usage_gc = alloc_percpu(struct bch_dev_usage); + if (!ca->usage_gc) { + bch_err(c, "error allocating ca->usage_gc"); @@ -14336,110 +15038,215 @@ index 000000000000..091bddee575d + } + } + -+ ret = bch2_ec_mem_alloc(c, true); -+ if (ret) { -+ bch_err(c, "error allocating ec gc mem"); -+ return ret; -+ } -+ -+ percpu_down_write(&c->mark_lock); -+ -+ /* -+ * indicate to stripe code that we need to allocate for the gc stripes -+ * radix tree, too -+ */ -+ gc_pos_set(c, gc_phase(GC_PHASE_START)); -+ -+ for_each_member_device(ca, c, i) { -+ struct bucket_array *dst = __bucket_array(ca, 1); -+ struct bucket_array *src = __bucket_array(ca, 0); -+ size_t b; -+ -+ dst->first_bucket = src->first_bucket; -+ dst->nbuckets = src->nbuckets; -+ -+ for (b = 0; b < src->nbuckets; b++) { -+ struct bucket *d = &dst->b[b]; -+ struct bucket *s = &src->b[b]; -+ -+ d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen; -+ d->gen_valid = s->gen_valid; -+ -+ if (metadata_only && -+ (s->mark.data_type == BCH_DATA_user || -+ s->mark.data_type == BCH_DATA_cached)) -+ d->_mark = s->mark; -+ } -+ }; -+ -+ percpu_up_write(&c->mark_lock); -+ + return 0; +} + -+static int bch2_gc_reflink_done_initial_fn(struct btree_trans *trans, -+ struct bkey_s_c k) ++/* 
returns true if not equal */ ++static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l, ++ struct bch_alloc_v4 r) ++{ ++ return l.gen != r.gen || ++ l.oldest_gen != r.oldest_gen || ++ l.data_type != r.data_type || ++ l.dirty_sectors != r.dirty_sectors || ++ l.cached_sectors != r.cached_sectors || ++ l.stripe_redundancy != r.stripe_redundancy || ++ l.stripe != r.stripe; ++} ++ ++static int bch2_alloc_write_key(struct btree_trans *trans, ++ struct btree_iter *iter, ++ bool metadata_only) +{ + struct bch_fs *c = trans->c; -+ struct reflink_gc *r; -+ const __le64 *refcount = bkey_refcount_c(k); -+ char buf[200]; -+ int ret = 0; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode); ++ struct bucket gc; ++ struct bkey_s_c k; ++ struct bkey_i_alloc_v4 *a; ++ struct bch_alloc_v4 old, new; ++ int ret; + -+ if (!refcount) ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ return ret; ++ ++ bch2_alloc_to_v4(k, &old); ++ new = old; ++ ++ percpu_down_read(&c->mark_lock); ++ gc = *gc_bucket(ca, iter->pos.offset); ++ percpu_up_read(&c->mark_lock); ++ ++ if (metadata_only && ++ gc.data_type != BCH_DATA_sb && ++ gc.data_type != BCH_DATA_journal && ++ gc.data_type != BCH_DATA_btree) + return 0; + -+ r = genradix_ptr(&c->reflink_gc_table, c->reflink_gc_idx++); -+ if (!r) -+ return -ENOMEM; ++ if (gen_after(old.gen, gc.gen)) ++ return 0; + -+ if (!r || -+ r->offset != k.k->p.offset || -+ r->size != k.k->size) { -+ bch_err(c, "unexpected inconsistency walking reflink table at gc finish"); -+ return -EINVAL; -+ } ++#define copy_bucket_field(_f) \ ++ if (fsck_err_on(new._f != gc._f, c, \ ++ "bucket %llu:%llu gen %u data type %s has wrong " #_f \ ++ ": got %u, should be %u", \ ++ iter->pos.inode, iter->pos.offset, \ ++ gc.gen, \ ++ bch2_data_types[gc.data_type], \ ++ new._f, gc._f)) \ ++ new._f = gc._f; \ + -+ if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c, -+ "reflink key has wrong refcount:\n" -+ " %s\n" -+ " should be %u", -+ 
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf), -+ r->refcount)) { -+ struct bkey_i *new; ++ copy_bucket_field(gen); ++ copy_bucket_field(data_type); ++ copy_bucket_field(dirty_sectors); ++ copy_bucket_field(cached_sectors); ++ copy_bucket_field(stripe_redundancy); ++ copy_bucket_field(stripe); ++#undef copy_bucket_field + -+ new = kmalloc(bkey_bytes(k.k), GFP_KERNEL); -+ if (!new) { -+ ret = -ENOMEM; -+ goto fsck_err; -+ } ++ if (!bch2_alloc_v4_cmp(old, new)) ++ return 0; + -+ bkey_reassemble(new, k); ++ a = bch2_alloc_to_v4_mut(trans, k); ++ ret = PTR_ERR_OR_ZERO(a); ++ if (ret) ++ return ret; + -+ if (!r->refcount) { -+ new->k.type = KEY_TYPE_deleted; -+ new->k.size = 0; -+ } else { -+ *bkey_refcount(new) = cpu_to_le64(r->refcount); -+ } ++ a->v = new; + -+ ret = bch2_journal_key_insert(c, BTREE_ID_reflink, 0, new); -+ if (ret) -+ kfree(new); -+ } ++ ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_NORUN); +fsck_err: + return ret; +} + -+static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, -+ bool metadata_only) ++static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bch_dev *ca; ++ unsigned i; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_member_device(ca, c, i) { ++ for_each_btree_key(&trans, iter, BTREE_ID_alloc, ++ POS(ca->dev_idx, ca->mi.first_bucket), ++ BTREE_ITER_SLOTS| ++ BTREE_ITER_PREFETCH, k, ret) { ++ if (bkey_cmp(iter.pos, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0) ++ break; ++ ++ ret = __bch2_trans_do(&trans, NULL, NULL, ++ BTREE_INSERT_LAZY_RW, ++ bch2_alloc_write_key(&trans, &iter, ++ metadata_only)); ++ if (ret) ++ break; ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ if (ret) { ++ bch_err(c, "error writing alloc info: %i", ret); ++ percpu_ref_put(&ca->ref); ++ break; ++ } ++ } ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int bch2_gc_alloc_start(struct bch_fs *c, bool 
metadata_only) ++{ ++ struct bch_dev *ca; ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bucket *g; ++ struct bch_alloc_v4 a; ++ unsigned i; ++ int ret; ++ ++ for_each_member_device(ca, c, i) { ++ struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) + ++ ca->mi.nbuckets * sizeof(struct bucket), ++ GFP_KERNEL|__GFP_ZERO); ++ if (!buckets) { ++ percpu_ref_put(&ca->ref); ++ bch_err(c, "error allocating ca->buckets[gc]"); ++ return -ENOMEM; ++ } ++ ++ buckets->first_bucket = ca->mi.first_bucket; ++ buckets->nbuckets = ca->mi.nbuckets; ++ rcu_assign_pointer(ca->buckets_gc, buckets); ++ }; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ ca = bch_dev_bkey_exists(c, k.k->p.inode); ++ g = gc_bucket(ca, k.k->p.offset); ++ ++ bch2_alloc_to_v4(k, &a); ++ ++ g->gen_valid = 1; ++ g->gen = a.gen; ++ ++ if (metadata_only && ++ (a.data_type == BCH_DATA_user || ++ a.data_type == BCH_DATA_cached || ++ a.data_type == BCH_DATA_parity)) { ++ g->data_type = a.data_type; ++ g->dirty_sectors = a.dirty_sectors; ++ g->cached_sectors = a.cached_sectors; ++ g->stripe = a.stripe; ++ g->stripe_redundancy = a.stripe_redundancy; ++ } ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ bch2_trans_exit(&trans); ++ ++ if (ret) ++ bch_err(c, "error reading alloc info at gc start: %i", ret); ++ ++ return ret; ++} ++ ++static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ ++ for_each_member_device(ca, c, i) { ++ struct bucket_array *buckets = gc_bucket_array(ca); ++ struct bucket *g; ++ ++ for_each_bucket(g, buckets) { ++ if (metadata_only && ++ (g->data_type == BCH_DATA_user || ++ g->data_type == BCH_DATA_cached || ++ g->data_type == BCH_DATA_parity)) ++ continue; ++ g->dirty_sectors = 0; ++ g->cached_sectors = 0; ++ } ++ }; ++} ++ ++static int bch2_gc_reflink_done(struct bch_fs *c, bool 
metadata_only) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct reflink_gc *r; + size_t idx = 0; -+ char buf[200]; ++ struct printbuf buf = PRINTBUF; + int ret = 0; + + if (metadata_only) @@ -14447,14 +15254,6 @@ index 000000000000..091bddee575d + + bch2_trans_init(&trans, c, 0, 0); + -+ if (initial) { -+ c->reflink_gc_idx = 0; -+ -+ ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_reflink, -+ bch2_gc_reflink_done_initial_fn); -+ goto out; -+ } -+ + for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + const __le64 *refcount = bkey_refcount_c(k); @@ -14462,7 +15261,7 @@ index 000000000000..091bddee575d + if (!refcount) + continue; + -+ r = genradix_ptr(&c->reflink_gc_table, idx); ++ r = genradix_ptr(&c->reflink_gc_table, idx++); + if (!r || + r->offset != k.k->p.offset || + r->size != k.k->size) { @@ -14475,7 +15274,8 @@ index 000000000000..091bddee575d + "reflink key has wrong refcount:\n" + " %s\n" + " should be %u", -+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf), ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf), + r->refcount)) { + struct bkey_i *new; + @@ -14493,7 +15293,7 @@ index 000000000000..091bddee575d + *bkey_refcount(new) = cpu_to_le64(r->refcount); + + ret = __bch2_trans_do(&trans, NULL, NULL, 0, -+ __bch2_btree_insert(&trans, BTREE_ID_reflink, new)); ++ __bch2_btree_insert(&trans, BTREE_ID_reflink, new)); + kfree(new); + + if (ret) @@ -14502,36 +15302,13 @@ index 000000000000..091bddee575d + } +fsck_err: + bch2_trans_iter_exit(&trans, &iter); -+out: -+ genradix_free(&c->reflink_gc_table); + c->reflink_gc_nr = 0; + bch2_trans_exit(&trans); ++ printbuf_exit(&buf); + return ret; +} + -+static int bch2_gc_reflink_start_initial_fn(struct btree_trans *trans, -+ struct bkey_s_c k) -+{ -+ -+ struct bch_fs *c = trans->c; -+ struct reflink_gc *r; -+ const __le64 *refcount = bkey_refcount_c(k); -+ -+ if (!refcount) -+ return 0; -+ -+ r = 
genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++, -+ GFP_KERNEL); -+ if (!r) -+ return -ENOMEM; -+ -+ r->offset = k.k->p.offset; -+ r->size = k.k->size; -+ r->refcount = 0; -+ return 0; -+} -+ -+static int bch2_gc_reflink_start(struct bch_fs *c, bool initial, ++static int bch2_gc_reflink_start(struct bch_fs *c, + bool metadata_only) +{ + struct btree_trans trans; @@ -14544,15 +15321,8 @@ index 000000000000..091bddee575d + return 0; + + bch2_trans_init(&trans, c, 0, 0); -+ genradix_free(&c->reflink_gc_table); + c->reflink_gc_nr = 0; + -+ if (initial) { -+ ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_reflink, -+ bch2_gc_reflink_start_initial_fn); -+ goto out; -+ } -+ + for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + const __le64 *refcount = bkey_refcount_c(k); @@ -14572,11 +15342,88 @@ index 000000000000..091bddee575d + r->refcount = 0; + } + bch2_trans_iter_exit(&trans, &iter); -+out: ++ + bch2_trans_exit(&trans); + return ret; +} + ++static void bch2_gc_reflink_reset(struct bch_fs *c, bool metadata_only) ++{ ++ struct genradix_iter iter; ++ struct reflink_gc *r; ++ ++ genradix_for_each(&c->reflink_gc_table, iter, r) ++ r->refcount = 0; ++} ++ ++static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct gc_stripe *m; ++ const struct bch_stripe *s; ++ struct printbuf buf = PRINTBUF; ++ unsigned i; ++ int ret = 0; ++ ++ if (metadata_only) ++ return 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ if (k.k->type != KEY_TYPE_stripe) ++ continue; ++ ++ s = bkey_s_c_to_stripe(k).v; ++ m = genradix_ptr(&c->gc_stripes, k.k->p.offset); ++ ++ for (i = 0; i < s->nr_blocks; i++) ++ if (stripe_blockcount_get(s, i) != (m ? 
m->block_sectors[i] : 0)) ++ goto inconsistent; ++ continue; ++inconsistent: ++ if (fsck_err_on(true, c, ++ "stripe has wrong block sector count %u:\n" ++ " %s\n" ++ " should be %u", i, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf), ++ m ? m->block_sectors[i] : 0)) { ++ struct bkey_i_stripe *new; ++ ++ new = kmalloc(bkey_bytes(k.k), GFP_KERNEL); ++ if (!new) { ++ ret = -ENOMEM; ++ break; ++ } ++ ++ bkey_reassemble(&new->k_i, k); ++ ++ for (i = 0; i < new->v.nr_blocks; i++) ++ stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0); ++ ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ __bch2_btree_insert(&trans, BTREE_ID_reflink, &new->k_i)); ++ kfree(new); ++ } ++ } ++fsck_err: ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ bch2_trans_exit(&trans); ++ ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only) ++{ ++ genradix_free(&c->gc_stripes); ++} ++ +/** + * bch2_gc - walk _all_ references to buckets, and recompute them: + * @@ -14597,9 +15444,8 @@ index 000000000000..091bddee575d + */ +int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only) +{ -+ struct bch_dev *ca; + u64 start_time = local_clock(); -+ unsigned i, iter = 0; ++ unsigned iter = 0; + int ret; + + lockdep_assert_held(&c->state_lock); @@ -14610,11 +15456,14 @@ index 000000000000..091bddee575d + /* flush interior btree updates: */ + closure_wait_event(&c->btree_interior_update_wait, + !bch2_btree_interior_updates_nr_pending(c)); -+again: ++ + ret = bch2_gc_start(c, metadata_only) ?: -+ bch2_gc_reflink_start(c, initial, metadata_only); ++ bch2_gc_alloc_start(c, metadata_only) ?: ++ bch2_gc_reflink_start(c, metadata_only); + if (ret) + goto out; ++again: ++ gc_pos_set(c, gc_phase(GC_PHASE_START)); + + bch2_mark_superblocks(c); + @@ -14652,39 +15501,40 @@ index 000000000000..091bddee575d + + if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) || + (!iter && bch2_test_restart_gc)) { ++ if (iter++ > 2) { ++ 
bch_info(c, "Unable to fix bucket gens, looping"); ++ ret = -EINVAL; ++ goto out; ++ } ++ + /* + * XXX: make sure gens we fixed got saved + */ -+ if (iter++ <= 2) { -+ bch_info(c, "Second GC pass needed, restarting:"); -+ clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); -+ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); ++ bch_info(c, "Second GC pass needed, restarting:"); ++ clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); ++ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); + -+ percpu_down_write(&c->mark_lock); -+ bch2_gc_free(c); -+ percpu_up_write(&c->mark_lock); -+ /* flush fsck errors, reset counters */ -+ bch2_flush_fsck_errs(c); ++ bch2_gc_stripes_reset(c, metadata_only); ++ bch2_gc_alloc_reset(c, metadata_only); ++ bch2_gc_reflink_reset(c, metadata_only); + -+ goto again; -+ } -+ -+ bch_info(c, "Unable to fix bucket gens, looping"); -+ ret = -EINVAL; ++ /* flush fsck errors, reset counters */ ++ bch2_flush_fsck_errs(c); ++ goto again; + } +out: + if (!ret) { + bch2_journal_block(&c->journal); + -+ percpu_down_write(&c->mark_lock); -+ ret = bch2_gc_reflink_done(c, initial, metadata_only) ?: ++ ret = bch2_gc_stripes_done(c, metadata_only) ?: ++ bch2_gc_reflink_done(c, metadata_only) ?: ++ bch2_gc_alloc_done(c, metadata_only) ?: + bch2_gc_done(c, initial, metadata_only); + + bch2_journal_unblock(&c->journal); -+ } else { -+ percpu_down_write(&c->mark_lock); + } + ++ percpu_down_write(&c->mark_lock); + /* Indicates that gc is no longer in progress: */ + __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); + @@ -14697,13 +15547,6 @@ index 000000000000..091bddee575d + bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); + + /* -+ * Wake up allocator in case it was waiting for buckets -+ * because of not being able to inc gens -+ */ -+ for_each_member_device(ca, c, i) -+ bch2_wake_allocator(ca); -+ -+ /* + * At startup, allocations can happen directly instead of via the + * allocator thread - issue wakeup in case they blocked on gc_lock: + */ @@ -14719,9 
+15562,8 @@ index 000000000000..091bddee575d + percpu_down_read(&c->mark_lock); + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ struct bucket *g = PTR_BUCKET(ca, ptr, false); + -+ if (gen_after(g->mark.gen, ptr->gen) > 16) { ++ if (ptr_stale(ca, ptr) > 16) { + percpu_up_read(&c->mark_lock); + return true; + } @@ -14729,10 +15571,10 @@ index 000000000000..091bddee575d + + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ struct bucket *g = PTR_BUCKET(ca, ptr, false); ++ u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)]; + -+ if (gen_after(g->gc_gen, ptr->gen)) -+ g->gc_gen = ptr->gen; ++ if (gen_after(*gen, ptr->gen)) ++ *gen = ptr->gen; + } + percpu_up_read(&c->mark_lock); + @@ -14743,23 +15585,22 @@ index 000000000000..091bddee575d + * For recalculating oldest gen, we only need to walk keys in leaf nodes; btree + * node pointers currently never have cached pointers that can become stale: + */ -+static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) ++static int bch2_gc_btree_gens(struct btree_trans *trans, enum btree_id btree_id) +{ -+ struct btree_trans trans; ++ struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_buf sk; + int ret = 0, commit_err = 0; + + bch2_bkey_buf_init(&sk); -+ bch2_trans_init(&trans, c, 0, 0); + -+ bch2_trans_iter_init(&trans, &iter, btree_id, POS_MIN, ++ bch2_trans_iter_init(trans, &iter, btree_id, POS_MIN, + BTREE_ITER_PREFETCH| + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS); + -+ while ((bch2_trans_begin(&trans), ++ while ((bch2_trans_begin(trans), + k = bch2_btree_iter_peek(&iter)).k) { + ret = bkey_err(k); + @@ -14775,10 +15616,10 @@ index 000000000000..091bddee575d + bch2_extent_normalize(c, bkey_i_to_s(sk.k)); + + commit_err = -+ bch2_trans_update(&trans, &iter, sk.k, 0) ?: -+ bch2_trans_commit(&trans, NULL, NULL, -+ BTREE_INSERT_NOWAIT| -+ BTREE_INSERT_NOFAIL); ++ 
bch2_trans_update(trans, &iter, sk.k, 0) ?: ++ bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_NOWAIT| ++ BTREE_INSERT_NOFAIL); + if (commit_err == -EINTR) { + commit_err = 0; + continue; @@ -14787,19 +15628,48 @@ index 000000000000..091bddee575d + + bch2_btree_iter_advance(&iter); + } -+ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_iter_exit(trans, &iter); + -+ bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&sk, c); + + return ret; +} + ++static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter *iter) ++{ ++ struct bch_dev *ca = bch_dev_bkey_exists(trans->c, iter->pos.inode); ++ struct bkey_s_c k; ++ struct bch_alloc_v4 a; ++ struct bkey_i_alloc_v4 *a_mut; ++ int ret; ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ return ret; ++ ++ bch2_alloc_to_v4(k, &a); ++ ++ if (a.oldest_gen == ca->oldest_gen[iter->pos.offset]) ++ return 0; ++ ++ a_mut = bch2_alloc_to_v4_mut(trans, k); ++ ret = PTR_ERR_OR_ZERO(a_mut); ++ if (ret) ++ return ret; ++ ++ a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset]; ++ ++ return bch2_trans_update(trans, iter, &a_mut->k_i, 0); ++} ++ +int bch2_gc_gens(struct bch_fs *c) +{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; + struct bch_dev *ca; -+ struct bucket_array *buckets; -+ struct bucket *g; ++ u64 b, start_time = local_clock(); + unsigned i; + int ret; + @@ -14808,43 +15678,69 @@ index 000000000000..091bddee575d + * introduces a deadlock in the RO path - we currently take the state + * lock at the start of going RO, thus the gc thread may get stuck: + */ ++ if (!mutex_trylock(&c->gc_gens_lock)) ++ return 0; ++ + down_read(&c->gc_lock); ++ bch2_trans_init(&trans, c, 0, 0); + + for_each_member_device(ca, c, i) { -+ down_read(&ca->bucket_lock); -+ buckets = bucket_array(ca); ++ struct bucket_gens *gens; + -+ for_each_bucket(g, buckets) -+ g->gc_gen = g->mark.gen; -+ up_read(&ca->bucket_lock); ++ BUG_ON(ca->oldest_gen); ++ ++ ca->oldest_gen 
= kvmalloc(ca->mi.nbuckets, GFP_KERNEL); ++ if (!ca->oldest_gen) { ++ percpu_ref_put(&ca->ref); ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ gens = bucket_gens(ca); ++ ++ for (b = gens->first_bucket; ++ b < gens->nbuckets; b++) ++ ca->oldest_gen[b] = gens->b[b]; + } + + for (i = 0; i < BTREE_ID_NR; i++) + if ((1 << i) & BTREE_ID_HAS_PTRS) { + c->gc_gens_btree = i; + c->gc_gens_pos = POS_MIN; -+ ret = bch2_gc_btree_gens(c, i); ++ ret = bch2_gc_btree_gens(&trans, i); + if (ret) { + bch_err(c, "error recalculating oldest_gen: %i", ret); + goto err; + } + } + -+ for_each_member_device(ca, c, i) { -+ down_read(&ca->bucket_lock); -+ buckets = bucket_array(ca); -+ -+ for_each_bucket(g, buckets) -+ g->oldest_gen = g->gc_gen; -+ up_read(&ca->bucket_lock); ++ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ ret = __bch2_trans_do(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL, ++ bch2_alloc_write_oldest_gen(&trans, &iter)); ++ if (ret) { ++ bch_err(c, "error writing oldest_gen: %i", ret); ++ break; ++ } + } ++ bch2_trans_iter_exit(&trans, &iter); + + c->gc_gens_btree = 0; + c->gc_gens_pos = POS_MIN; + + c->gc_count++; ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); +err: ++ for_each_member_device(ca, c, i) { ++ kvfree(ca->oldest_gen); ++ ca->oldest_gen = NULL; ++ } ++ ++ bch2_trans_exit(&trans); + up_read(&c->gc_lock); ++ mutex_unlock(&c->gc_gens_lock); + return ret; +} + @@ -14938,10 +15834,10 @@ index 000000000000..091bddee575d +} diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h new file mode 100644 -index 000000000000..59dfb069e699 +index 000000000000..0665f5941fcc --- /dev/null +++ b/fs/bcachefs/btree_gc.h -@@ -0,0 +1,106 @@ +@@ -0,0 +1,105 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_GC_H +#define _BCACHEFS_BTREE_GC_H @@ -14952,7 +15848,6 @@ index 000000000000..59dfb069e699 +int bch2_gc_gens(struct bch_fs *); +void bch2_gc_thread_stop(struct bch_fs *); +int 
bch2_gc_thread_start(struct bch_fs *); -+void bch2_mark_dev_superblock(struct bch_fs *, struct bch_dev *, unsigned); + +/* + * For concurrent mark and sweep (with other index updates), we define a total @@ -15050,10 +15945,10 @@ index 000000000000..59dfb069e699 +#endif /* _BCACHEFS_BTREE_GC_H */ diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c new file mode 100644 -index 000000000000..f11fcab61902 +index 000000000000..4b880ea59cad --- /dev/null +++ b/fs/bcachefs/btree_io.c -@@ -0,0 +1,2124 @@ +@@ -0,0 +1,2111 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -15447,16 +16342,10 @@ index 000000000000..f11fcab61902 + + bch2_btree_node_iter_init_from_start(&src_iter, src); + -+ if (btree_node_is_extents(src)) -+ nr = bch2_sort_repack_merge(c, btree_bset_first(dst), -+ src, &src_iter, -+ &dst->format, -+ true); -+ else -+ nr = bch2_sort_repack(btree_bset_first(dst), -+ src, &src_iter, -+ &dst->format, -+ true); ++ nr = bch2_sort_repack(btree_bset_first(dst), ++ src, &src_iter, ++ &dst->format, ++ true); + + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], + start_time); @@ -15539,7 +16428,7 @@ index 000000000000..f11fcab61902 + }; + + if (log_u64s[1] >= (log_u64s[0] + log_u64s[2]) / 2) { -+ bch2_btree_node_write(c, b, SIX_LOCK_write); ++ bch2_btree_node_write(c, b, SIX_LOCK_write, 0); + reinit_iter = true; + } + } @@ -15602,13 +16491,7 @@ index 000000000000..f11fcab61902 +#define btree_err(type, c, ca, b, i, msg, ...) 
\ +({ \ + __label__ out; \ -+ char _buf[300]; \ -+ char *_buf2 = _buf; \ -+ struct printbuf out = PBUF(_buf); \ -+ \ -+ _buf2 = kmalloc(4096, GFP_ATOMIC); \ -+ if (_buf2) \ -+ out = _PBUF(_buf2, 4986); \ ++ struct printbuf out = PRINTBUF; \ + \ + btree_err_msg(&out, c, ca, b, i, b->written, write); \ + pr_buf(&out, ": " msg, ##__VA_ARGS__); \ @@ -15616,13 +16499,13 @@ index 000000000000..f11fcab61902 + if (type == BTREE_ERR_FIXABLE && \ + write == READ && \ + !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \ -+ mustfix_fsck_err(c, "%s", _buf2); \ ++ mustfix_fsck_err(c, "%s", out.buf); \ + goto out; \ + } \ + \ + switch (write) { \ + case READ: \ -+ bch_err(c, "%s", _buf2); \ ++ bch_err(c, "%s", out.buf); \ + \ + switch (type) { \ + case BTREE_ERR_FIXABLE: \ @@ -15643,7 +16526,7 @@ index 000000000000..f11fcab61902 + } \ + break; \ + case WRITE: \ -+ bch_err(c, "corrupt metadata before write: %s", _buf2); \ ++ bch_err(c, "corrupt metadata before write: %s", out.buf);\ + \ + if (bch2_fs_inconsistent(c)) { \ + ret = BCH_FSCK_ERRORS_NOT_FIXED; \ @@ -15652,8 +16535,7 @@ index 000000000000..f11fcab61902 + break; \ + } \ +out: \ -+ if (_buf2 != _buf) \ -+ kfree(_buf2); \ ++ printbuf_exit(&out); \ + true; \ +}) + @@ -15714,8 +16596,8 @@ index 000000000000..f11fcab61902 +{ + unsigned version = le16_to_cpu(i->version); + const char *err; -+ char buf1[100]; -+ char buf2[100]; ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; + int ret = 0; + + btree_err_on((version != BCH_BSET_VERSION_OLD && @@ -15748,11 +16630,12 @@ index 000000000000..f11fcab61902 + BTREE_ERR_FATAL, c, ca, b, i, + "BSET_SEPARATE_WHITEOUTS no longer supported"); + -+ if (btree_err_on(offset + sectors > c->opts.btree_node_size, ++ if (btree_err_on(offset + sectors > btree_sectors(c), + BTREE_ERR_FIXABLE, c, ca, b, i, + "bset past end of btree node")) { + i->u64s = 0; -+ return 0; ++ ret = 0; ++ goto out; + } + + btree_err_on(offset && !i->u64s, @@ -15803,14 +16686,17 @@ index 
000000000000..f11fcab61902 + btree_err_on(bpos_cmp(b->data->min_key, bp->min_key), + BTREE_ERR_MUST_RETRY, c, ca, b, NULL, + "incorrect min_key: got %s should be %s", -+ (bch2_bpos_to_text(&PBUF(buf1), bn->min_key), buf1), -+ (bch2_bpos_to_text(&PBUF(buf2), bp->min_key), buf2)); ++ (printbuf_reset(&buf1), ++ bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf), ++ (printbuf_reset(&buf2), ++ bch2_bpos_to_text(&buf2, bp->min_key), buf2.buf)); + } + + btree_err_on(bpos_cmp(bn->max_key, b->key.k.p), + BTREE_ERR_MUST_RETRY, c, ca, b, i, + "incorrect max key %s", -+ (bch2_bpos_to_text(&PBUF(buf1), bn->max_key), buf1)); ++ (printbuf_reset(&buf1), ++ bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf)); + + if (write) + compat_btree_node(b->c.level, b->c.btree_id, version, @@ -15825,7 +16711,10 @@ index 000000000000..f11fcab61902 + BSET_BIG_ENDIAN(i), write, + &bn->format); + } ++out: +fsck_err: ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf1); + return ret; +} + @@ -15835,6 +16724,8 @@ index 000000000000..f11fcab61902 +{ + unsigned version = le16_to_cpu(i->version); + struct bkey_packed *k, *prev = NULL; ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; + bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && + BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); + int ret = 0; @@ -15873,11 +16764,10 @@ index 000000000000..f11fcab61902 + (!updated_range ? bch2_bkey_in_btree_node(b, u.s_c) : NULL) ?: + (write ? 
bch2_bkey_val_invalid(c, u.s_c) : NULL); + if (invalid) { -+ char buf[160]; -+ -+ bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c); ++ printbuf_reset(&buf1); ++ bch2_bkey_val_to_text(&buf1, c, u.s_c); + btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, -+ "invalid bkey: %s\n%s", invalid, buf); ++ "invalid bkey: %s\n%s", invalid, buf1.buf); + + i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); + memmove_u64s_down(k, bkey_next(k), @@ -15891,18 +16781,18 @@ index 000000000000..f11fcab61902 + &b->format, k); + + if (prev && bkey_iter_cmp(b, prev, k) > 0) { -+ char buf1[80]; -+ char buf2[80]; + struct bkey up = bkey_unpack_key(b, prev); + -+ bch2_bkey_to_text(&PBUF(buf1), &up); -+ bch2_bkey_to_text(&PBUF(buf2), u.k); ++ printbuf_reset(&buf1); ++ bch2_bkey_to_text(&buf1, &up); ++ printbuf_reset(&buf2); ++ bch2_bkey_to_text(&buf2, u.k); + + bch2_dump_bset(c, b, i, 0); + + if (btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, + "keys out of order: %s > %s", -+ buf1, buf2)) { ++ buf1.buf, buf2.buf)) { + i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); + memmove_u64s_down(k, bkey_next(k), + (u64 *) vstruct_end(i) - (u64 *) k); @@ -15914,6 +16804,8 @@ index 000000000000..f11fcab61902 + k = bkey_next(k); + } +fsck_err: ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf1); + return ret; +} + @@ -15946,11 +16838,12 @@ index 000000000000..f11fcab61902 + + btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c), + BTREE_ERR_MUST_RETRY, c, ca, b, NULL, -+ "bad magic"); ++ "bad magic: want %llx, got %llx", ++ bset_magic(c), le64_to_cpu(b->data->magic)); + + btree_err_on(!b->data->keys.seq, + BTREE_ERR_MUST_RETRY, c, ca, b, NULL, -+ "bad btree header"); ++ "bad btree header: seq 0"); + + if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { + struct bch_btree_ptr_v2 *bp = @@ -15962,7 +16855,7 @@ index 000000000000..f11fcab61902 + b->data->keys.seq, bp->seq); + } + -+ while (b->written < (ptr_written ?: c->opts.btree_node_size)) { ++ while (b->written < (ptr_written ?: btree_sectors(c))) { + unsigned 
sectors, whiteout_u64s = 0; + struct nonce nonce; + struct bch_csum csum; @@ -15983,9 +16876,12 @@ index 000000000000..f11fcab61902 + BTREE_ERR_WANT_RETRY, c, ca, b, i, + "invalid checksum"); + -+ bset_encrypt(c, i, b->written << 9); ++ ret = bset_encrypt(c, i, b->written << 9); ++ if (bch2_fs_fatal_err_on(ret, c, ++ "error decrypting btree node: %i", ret)) ++ goto fsck_err; + -+ btree_err_on(btree_node_is_extents(b) && ++ btree_err_on(btree_node_type_is_extents(btree_node_type(b)) && + !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data), + BTREE_ERR_FATAL, c, NULL, b, NULL, + "btree node does not have NEW_EXTENT_OVERWRITE set"); @@ -16010,7 +16906,10 @@ index 000000000000..f11fcab61902 + BTREE_ERR_WANT_RETRY, c, ca, b, i, + "invalid checksum"); + -+ bset_encrypt(c, i, b->written << 9); ++ ret = bset_encrypt(c, i, b->written << 9); ++ if (bch2_fs_fatal_err_on(ret, c, ++ "error decrypting btree node: %i\n", ret)) ++ goto fsck_err; + + sectors = vstruct_sectors(bne, c->block_bits); + } @@ -16033,19 +16932,23 @@ index 000000000000..f11fcab61902 + + SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); + -+ b->written += sectors; -+ + blacklisted = bch2_journal_seq_is_blacklisted(c, + le64_to_cpu(i->journal_seq), + true); + + btree_err_on(blacklisted && first, + BTREE_ERR_FIXABLE, c, ca, b, i, -+ "first btree node bset has blacklisted journal seq"); ++ "first btree node bset has blacklisted journal seq (%llu)", ++ le64_to_cpu(i->journal_seq)); + + btree_err_on(blacklisted && ptr_written, + BTREE_ERR_FIXABLE, c, ca, b, i, -+ "found blacklisted bset in btree node with sectors_written"); ++ "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u", ++ le64_to_cpu(i->journal_seq), ++ b->written, b->written + sectors, ptr_written); ++ ++ b->written += sectors; ++ + if (blacklisted && !first) + continue; + @@ -16118,11 +17021,12 @@ index 000000000000..f11fcab61902 + if (invalid || + (bch2_inject_invalid_keys && + !bversion_cmp(u.k->version, MAX_VERSION))) { -+ char buf[160]; ++ 
struct printbuf buf = PRINTBUF; + -+ bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c); ++ bch2_bkey_val_to_text(&buf, c, u.s_c); + btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, -+ "invalid bkey %s: %s", buf, invalid); ++ "invalid bkey %s: %s", buf.buf, invalid); ++ printbuf_exit(&buf); + + btree_keys_account_key_drop(&b->nr, 0, k); + @@ -16179,8 +17083,7 @@ index 000000000000..f11fcab61902 + struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); + struct bio *bio = &rb->bio; + struct bch_io_failures failed = { .nr = 0 }; -+ char buf[200]; -+ struct printbuf out; ++ struct printbuf buf = PRINTBUF; + bool saw_error = false; + bool can_retry; + @@ -16201,10 +17104,10 @@ index 000000000000..f11fcab61902 + bio->bi_status = BLK_STS_REMOVED; + } +start: -+ out = PBUF(buf); -+ btree_pos_to_text(&out, c, b); ++ printbuf_reset(&buf); ++ btree_pos_to_text(&buf, c, b); + bch2_dev_io_err_on(bio->bi_status, ca, "btree read error %s for %s", -+ bch2_blk_status_to_str(bio->bi_status), buf); ++ bch2_blk_status_to_str(bio->bi_status), buf.buf); + if (rb->have_ioref) + percpu_ref_put(&ca->io_ref); + rb->have_ioref = false; @@ -16230,6 +17133,7 @@ index 000000000000..f11fcab61902 + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read], + rb->start_time); + bio_put(&rb->bio); ++ printbuf_exit(&buf); + + if (saw_error && !btree_node_read_error(b)) + bch2_btree_node_rewrite_async(c, b); @@ -16271,7 +17175,7 @@ index 000000000000..f11fcab61902 + if (le64_to_cpu(bn->magic) != bset_magic(c)) + return 0; + -+ while (offset < c->opts.btree_node_size) { ++ while (offset < btree_sectors(c)) { + if (!offset) { + offset += vstruct_sectors(bn, c->block_bits); + } else { @@ -16293,7 +17197,7 @@ index 000000000000..f11fcab61902 + if (!offset) + return false; + -+ while (offset < c->opts.btree_node_size) { ++ while (offset < btree_sectors(c)) { + bne = data + (offset << 9); + if (bne->keys.seq == bn->keys.seq) + return true; @@ -16310,6 +17214,7 @@ index 000000000000..f11fcab61902 + 
container_of(cl, struct btree_node_read_all, cl); + struct bch_fs *c = ra->c; + struct btree *b = ra->b; ++ struct printbuf buf = PRINTBUF; + bool dump_bset_maps = false; + bool have_retry = false; + int ret = 0, best = -1, write = READ; @@ -16353,8 +17258,6 @@ index 000000000000..f11fcab61902 +fsck_err: + if (dump_bset_maps) { + for (i = 0; i < ra->nr; i++) { -+ char buf[200]; -+ struct printbuf out = PBUF(buf); + struct btree_node *bn = ra->buf[i]; + struct btree_node_entry *bne = NULL; + unsigned offset = 0, sectors; @@ -16363,7 +17266,9 @@ index 000000000000..f11fcab61902 + if (ra->err[i]) + continue; + -+ while (offset < c->opts.btree_node_size) { ++ printbuf_reset(&buf); ++ ++ while (offset < btree_sectors(c)) { + if (!offset) { + sectors = vstruct_sectors(bn, c->block_bits); + } else { @@ -16373,30 +17278,30 @@ index 000000000000..f11fcab61902 + sectors = vstruct_sectors(bne, c->block_bits); + } + -+ pr_buf(&out, " %u-%u", offset, offset + sectors); ++ pr_buf(&buf, " %u-%u", offset, offset + sectors); + if (bne && bch2_journal_seq_is_blacklisted(c, + le64_to_cpu(bne->keys.journal_seq), false)) -+ pr_buf(&out, "*"); ++ pr_buf(&buf, "*"); + offset += sectors; + } + -+ while (offset < c->opts.btree_node_size) { ++ while (offset < btree_sectors(c)) { + bne = ra->buf[i] + (offset << 9); + if (bne->keys.seq == bn->keys.seq) { + if (!gap) -+ pr_buf(&out, " GAP"); ++ pr_buf(&buf, " GAP"); + gap = true; + + sectors = vstruct_sectors(bne, c->block_bits); -+ pr_buf(&out, " %u-%u", offset, offset + sectors); ++ pr_buf(&buf, " %u-%u", offset, offset + sectors); + if (bch2_journal_seq_is_blacklisted(c, + le64_to_cpu(bne->keys.journal_seq), false)) -+ pr_buf(&out, "*"); ++ pr_buf(&buf, "*"); + } + offset++; + } + -+ bch_err(c, "replica %u:%s", i, buf); ++ bch_err(c, "replica %u:%s", i, buf.buf); + } + } + @@ -16417,6 +17322,7 @@ index 000000000000..f11fcab61902 + + closure_debug_destroy(&ra->cl); + kfree(ra); ++ printbuf_exit(&buf); + + clear_btree_node_read_in_flight(b); 
+ wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); @@ -16516,23 +17422,23 @@ index 000000000000..f11fcab61902 + struct btree_read_bio *rb; + struct bch_dev *ca; + struct bio *bio; -+ char buf[200]; ++ struct printbuf buf = PRINTBUF; + int ret; + -+ btree_pos_to_text(&PBUF(buf), c, b); ++ btree_pos_to_text(&buf, c, b); + trace_btree_read(c, b); + + if (bch2_verify_all_btree_replicas && + !btree_node_read_all_replicas(c, b, sync)) -+ return; ++ goto out; + + ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), + NULL, &pick); + if (bch2_fs_fatal_err_on(ret <= 0, c, + "btree node read error: no device to read from\n" -+ " at %s", buf)) { ++ " at %s", buf.buf)) { + set_btree_node_read_error(b); -+ return; ++ goto out; + } + + ca = bch_dev_bkey_exists(c, pick.ptr.dev); @@ -16573,6 +17479,8 @@ index 000000000000..f11fcab61902 + else + queue_work(c->io_complete_wq, &rb->work); + } ++out: ++ printbuf_exit(&buf); +} + +int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, @@ -16589,7 +17497,7 @@ index 000000000000..f11fcab61902 + closure_sync(&cl); + } while (ret); + -+ b = bch2_btree_node_mem_alloc(c); ++ b = bch2_btree_node_mem_alloc(c, level != 0); + bch2_btree_cache_cannibalize_unlock(c); + + BUG_ON(IS_ERR(b)); @@ -16639,7 +17547,7 @@ index 000000000000..f11fcab61902 + bch2_journal_pin_drop(&c->journal, &w->journal); +} + -+static void btree_node_write_done(struct bch_fs *c, struct btree *b) ++static void __btree_node_write_done(struct bch_fs *c, struct btree *b) +{ + struct btree_write *w = btree_prev_write(b); + unsigned long old, new, v; @@ -16650,26 +17558,11 @@ index 000000000000..f11fcab61902 + do { + old = new = v; + -+ if (old & (1U << BTREE_NODE_need_write)) -+ goto do_write; -+ -+ new &= ~(1U << BTREE_NODE_write_in_flight); -+ new &= ~(1U << BTREE_NODE_write_in_flight_inner); -+ } while ((v = cmpxchg(&b->flags, old, new)) != old); -+ -+ wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); -+ return; -+ -+do_write: -+ six_lock_read(&b->c.lock, 
NULL, NULL); -+ v = READ_ONCE(b->flags); -+ do { -+ old = new = v; -+ + if ((old & (1U << BTREE_NODE_dirty)) && + (old & (1U << BTREE_NODE_need_write)) && + !(old & (1U << BTREE_NODE_never_write)) && -+ btree_node_may_write(b)) { ++ !(old & (1U << BTREE_NODE_write_blocked)) && ++ !(old & (1U << BTREE_NODE_will_make_reachable))) { + new &= ~(1U << BTREE_NODE_dirty); + new &= ~(1U << BTREE_NODE_need_write); + new |= (1U << BTREE_NODE_write_in_flight); @@ -16683,8 +17576,15 @@ index 000000000000..f11fcab61902 + } while ((v = cmpxchg(&b->flags, old, new)) != old); + + if (new & (1U << BTREE_NODE_write_in_flight)) -+ __bch2_btree_node_write(c, b, true); ++ __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED); ++ else ++ wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); ++} + ++static void btree_node_write_done(struct bch_fs *c, struct btree *b) ++{ ++ six_lock_read(&b->c.lock, NULL, NULL); ++ __btree_node_write_done(c, b); + six_unlock_read(&b->c.lock); +} + @@ -16799,7 +17699,7 @@ index 000000000000..f11fcab61902 + bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, &tmp.k); +} + -+void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_started) ++void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) +{ + struct btree_write_bio *wbio; + struct bset_tree *t; @@ -16814,13 +17714,11 @@ index 000000000000..f11fcab61902 + unsigned long old, new; + bool validate_before_checksum = false; + void *data; ++ int ret; + -+ if (already_started) ++ if (flags & BTREE_WRITE_ALREADY_STARTED) + goto do_write; + -+ if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) -+ return; -+ + /* + * We may only have a read lock on the btree node - the dirty bit is our + * "lock" against racing with other threads that may be trying to start @@ -16834,13 +17732,21 @@ index 000000000000..f11fcab61902 + if (!(old & (1 << BTREE_NODE_dirty))) + return; + -+ if (!btree_node_may_write(b)) ++ if ((flags & BTREE_WRITE_ONLY_IF_NEED) && ++ 
!(old & (1 << BTREE_NODE_need_write))) + return; + -+ if (old & (1 << BTREE_NODE_never_write)) ++ if (old & ++ ((1 << BTREE_NODE_never_write)| ++ (1 << BTREE_NODE_write_blocked))) + return; + -+ BUG_ON(old & (1 << BTREE_NODE_write_in_flight)); ++ if (b->written && ++ (old & (1 << BTREE_NODE_will_make_reachable))) ++ return; ++ ++ if (old & (1 << BTREE_NODE_write_in_flight)) ++ return; + + new &= ~(1 << BTREE_NODE_dirty); + new &= ~(1 << BTREE_NODE_need_write); @@ -16858,8 +17764,8 @@ index 000000000000..f11fcab61902 + BUG_ON(btree_node_fake(b)); + BUG_ON((b->will_make_reachable != 0) != !b->written); + -+ BUG_ON(b->written >= c->opts.btree_node_size); -+ BUG_ON(b->written & (c->opts.block_size - 1)); ++ BUG_ON(b->written >= btree_sectors(c)); ++ BUG_ON(b->written & (block_sectors(c) - 1)); + BUG_ON(bset_written(b, btree_bset_last(b))); + BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c)); + BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format))); @@ -16932,11 +17838,11 @@ index 000000000000..f11fcab61902 + memset(data + bytes_to_write, 0, + (sectors_to_write << 9) - bytes_to_write); + -+ BUG_ON(b->written + sectors_to_write > c->opts.btree_node_size); ++ BUG_ON(b->written + sectors_to_write > btree_sectors(c)); + BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN); + BUG_ON(i->seq != b->data->keys.seq); + -+ i->version = c->sb.version < bcachefs_metadata_version_new_versioning ++ i->version = c->sb.version < bcachefs_metadata_version_bkey_renumber + ? 
cpu_to_le16(BCH_BSET_VERSION_OLD) + : cpu_to_le16(c->sb.version); + SET_BSET_OFFSET(i, b->written); @@ -16954,7 +17860,10 @@ index 000000000000..f11fcab61902 + validate_bset_for_write(c, b, i, sectors_to_write)) + goto err; + -+ bset_encrypt(c, i, b->written << 9); ++ ret = bset_encrypt(c, i, b->written << 9); ++ if (bch2_fs_fatal_err_on(ret, c, ++ "error encrypting btree node: %i\n", ret)) ++ goto err; + + nonce = btree_nonce(i, b->written << 9); + @@ -17037,7 +17946,7 @@ index 000000000000..f11fcab61902 + b->written += sectors_to_write; +nowrite: + btree_bounce_free(c, bytes, used_mempool, data); -+ btree_node_write_done(c, b); ++ __btree_node_write_done(c, b); +} + +/* @@ -17100,12 +18009,13 @@ index 000000000000..f11fcab61902 + * Use this one if the node is intent locked: + */ +void bch2_btree_node_write(struct bch_fs *c, struct btree *b, -+ enum six_lock_type lock_type_held) ++ enum six_lock_type lock_type_held, ++ unsigned flags) +{ + if (lock_type_held == SIX_LOCK_intent || + (lock_type_held == SIX_LOCK_read && + six_lock_tryupgrade(&b->c.lock))) { -+ __bch2_btree_node_write(c, b, false); ++ __bch2_btree_node_write(c, b, flags); + + /* don't cycle lock unnecessarily: */ + if (btree_node_just_written(b) && @@ -17117,7 +18027,7 @@ index 000000000000..f11fcab61902 + if (lock_type_held == SIX_LOCK_read) + six_lock_downgrade(&b->c.lock); + } else { -+ __bch2_btree_node_write(c, b, false); ++ __bch2_btree_node_write(c, b, flags); + if (lock_type_held == SIX_LOCK_write && + btree_node_just_written(b)) + bch2_btree_post_write_cleanup(c, b); @@ -17137,7 +18047,6 @@ index 000000000000..f11fcab61902 + rcu_read_unlock(); + wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE); + goto restart; -+ + } + rcu_read_unlock(); +} @@ -17151,39 +18060,12 @@ index 000000000000..f11fcab61902 +{ + __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight); +} -+ -+void bch2_dirty_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c) -+{ -+ struct bucket_table *tbl; -+ struct 
rhash_head *pos; -+ struct btree *b; -+ unsigned i; -+ -+ rcu_read_lock(); -+ for_each_cached_btree(b, c, tbl, i, pos) { -+ unsigned long flags = READ_ONCE(b->flags); -+ -+ if (!(flags & (1 << BTREE_NODE_dirty))) -+ continue; -+ -+ pr_buf(out, "%p d %u n %u l %u w %u b %u r %u:%lu\n", -+ b, -+ (flags & (1 << BTREE_NODE_dirty)) != 0, -+ (flags & (1 << BTREE_NODE_need_write)) != 0, -+ b->c.level, -+ b->written, -+ !list_empty_careful(&b->write_blocked), -+ b->will_make_reachable != 0, -+ b->will_make_reachable & 1); -+ } -+ rcu_read_unlock(); -+} diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h new file mode 100644 -index 000000000000..0f20224e2a77 +index 000000000000..d818d87661e8 --- /dev/null +++ b/fs/bcachefs/btree_io.h -@@ -0,0 +1,248 @@ +@@ -0,0 +1,222 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_IO_H +#define _BCACHEFS_BTREE_IO_H @@ -17201,18 +18083,13 @@ index 000000000000..0f20224e2a77 +struct btree_iter; +struct btree_node_read_all; + -+static inline bool btree_node_dirty(struct btree *b) -+{ -+ return test_bit(BTREE_NODE_dirty, &b->flags); -+} -+ -+static inline void set_btree_node_dirty(struct bch_fs *c, struct btree *b) ++static inline void set_btree_node_dirty_acct(struct bch_fs *c, struct btree *b) +{ + if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags)) + atomic_inc(&c->btree_cache.dirty); +} + -+static inline void clear_btree_node_dirty(struct bch_fs *c, struct btree *b) ++static inline void clear_btree_node_dirty_acct(struct bch_fs *c, struct btree *b) +{ + if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags)) + atomic_dec(&c->btree_cache.dirty); @@ -17253,12 +18130,6 @@ index 000000000000..0f20224e2a77 +void bch2_btree_node_wait_on_read(struct btree *); +void bch2_btree_node_wait_on_write(struct btree *); + -+static inline bool btree_node_may_write(struct btree *b) -+{ -+ return list_empty_careful(&b->write_blocked) && -+ (!b->written || !b->will_make_reachable); -+} -+ +enum compact_mode { + COMPACT_LAZY, + 
COMPACT_ALL, @@ -17297,22 +18168,25 @@ index 000000000000..0f20224e2a77 + }}; +} + -+static inline void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset) ++static inline int bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset) +{ + struct nonce nonce = btree_nonce(i, offset); ++ int ret; + + if (!offset) { + struct btree_node *bn = container_of(i, struct btree_node, keys); + unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; + -+ bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags, -+ bytes); ++ ret = bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, ++ &bn->flags, bytes); ++ if (ret) ++ return ret; + + nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE)); + } + -+ bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, -+ vstruct_end(i) - (void *) i->_data); ++ return bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, ++ vstruct_end(i) - (void *) i->_data); +} + +void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *); @@ -17331,41 +18205,23 @@ index 000000000000..0f20224e2a77 +void bch2_btree_complete_write(struct bch_fs *, struct btree *, + struct btree_write *); + -+void __bch2_btree_node_write(struct bch_fs *, struct btree *, bool); +bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); + ++#define BTREE_WRITE_ONLY_IF_NEED (1U << 0) ++#define BTREE_WRITE_ALREADY_STARTED (1U << 1) ++ ++void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned); +void bch2_btree_node_write(struct bch_fs *, struct btree *, -+ enum six_lock_type); ++ enum six_lock_type, unsigned); + +static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b, + enum six_lock_type lock_held) +{ -+ if (b->written && -+ btree_node_need_write(b) && -+ btree_node_may_write(b) && -+ !btree_node_write_in_flight(b)) -+ bch2_btree_node_write(c, b, lock_held); ++ bch2_btree_node_write(c, b, lock_held, BTREE_WRITE_ONLY_IF_NEED); +} + -+#define bch2_btree_node_write_cond(_c, _b, cond) \ -+do { \ -+ unsigned long 
old, new, v = READ_ONCE((_b)->flags); \ -+ \ -+ do { \ -+ old = new = v; \ -+ \ -+ if (!(old & (1 << BTREE_NODE_dirty)) || !(cond)) \ -+ break; \ -+ \ -+ new |= (1 << BTREE_NODE_need_write); \ -+ } while ((v = cmpxchg(&(_b)->flags, old, new)) != old); \ -+ \ -+ btree_node_write_if_need(_c, _b, SIX_LOCK_read); \ -+} while (0) -+ +void bch2_btree_flush_all_reads(struct bch_fs *); +void bch2_btree_flush_all_writes(struct bch_fs *); -+void bch2_dirty_btree_nodes_to_text(struct printbuf *, struct bch_fs *); + +static inline void compat_bformat(unsigned level, enum btree_id btree_id, + unsigned version, unsigned big_endian, @@ -17434,10 +18290,10 @@ index 000000000000..0f20224e2a77 +#endif /* _BCACHEFS_BTREE_IO_H */ diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c new file mode 100644 -index 000000000000..f43044e6fa37 +index 000000000000..25d254ee9eac --- /dev/null +++ b/fs/bcachefs/btree_iter.c -@@ -0,0 +1,2960 @@ +@@ -0,0 +1,3329 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -17452,6 +18308,7 @@ index 000000000000..f43044e6fa37 +#include "error.h" +#include "extents.h" +#include "journal.h" ++#include "recovery.h" +#include "replicas.h" +#include "subvolume.h" + @@ -17459,12 +18316,21 @@ index 000000000000..f43044e6fa37 +#include + +static void btree_trans_verify_sorted(struct btree_trans *); -+static void btree_path_check_sort(struct btree_trans *, struct btree_path *, int); ++inline void bch2_btree_path_check_sort(struct btree_trans *, struct btree_path *, int); + +static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *); +static inline void btree_path_list_add(struct btree_trans *, struct btree_path *, + struct btree_path *); + ++static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ return iter->ip_allocated; ++#else ++ return 0; ++#endif ++} ++ +static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *); + +/* @@ 
-17488,6 +18354,9 @@ index 000000000000..f43044e6fa37 + struct bpos r_pos, + unsigned r_level) +{ ++ /* ++ * Must match lock ordering as defined by __bch2_btree_node_lock: ++ */ + return cmp_int(l->btree_id, r_btree_id) ?: + cmp_int((int) l->cached, (int) r_cached) ?: + bpos_cmp(l->pos, r_pos) ?: @@ -17586,11 +18455,19 @@ index 000000000000..f43044e6fa37 + * goes to 0, and it's safe because we have the node intent + * locked: + */ -+ atomic64_sub(__SIX_VAL(read_lock, readers), -+ &b->c.lock.state.counter); -+ btree_node_lock_type(trans->c, b, SIX_LOCK_write); -+ atomic64_add(__SIX_VAL(read_lock, readers), -+ &b->c.lock.state.counter); ++ if (!b->c.lock.readers) ++ atomic64_sub(__SIX_VAL(read_lock, readers), ++ &b->c.lock.state.counter); ++ else ++ this_cpu_sub(*b->c.lock.readers, readers); ++ ++ six_lock_write(&b->c.lock, NULL, NULL); ++ ++ if (!b->c.lock.readers) ++ atomic64_add(__SIX_VAL(read_lock, readers), ++ &b->c.lock.state.counter); ++ else ++ this_cpu_add(*b->c.lock.readers, readers); +} + +bool __bch2_btree_node_relock(struct btree_trans *trans, @@ -17600,19 +18477,25 @@ index 000000000000..f43044e6fa37 + int want = __btree_lock_want(path, level); + + if (!is_btree_node(path, level)) -+ return false; ++ goto fail; + + if (race_fault()) -+ return false; ++ goto fail; + + if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) || + (btree_node_lock_seq_matches(path, b, level) && + btree_node_lock_increment(trans, b, level, want))) { -+ mark_btree_node_locked(path, level, want); ++ mark_btree_node_locked(trans, path, level, want); + return true; -+ } else { -+ return false; + } ++fail: ++ trace_btree_node_relock_fail(trans->fn, _RET_IP_, ++ path->btree_id, ++ &path->pos, ++ (unsigned long) b, ++ path->l[level].lock_seq, ++ is_btree_node(path, level) ? 
b->c.lock.state.seq : 0); ++ return false; +} + +bool bch2_btree_node_upgrade(struct btree_trans *trans, @@ -17653,13 +18536,13 @@ index 000000000000..f43044e6fa37 + + return false; +success: -+ mark_btree_node_intent_locked(path, level); ++ mark_btree_node_intent_locked(trans, path, level); + return true; +} + +static inline bool btree_path_get_locks(struct btree_trans *trans, + struct btree_path *path, -+ bool upgrade, unsigned long trace_ip) ++ bool upgrade) +{ + unsigned l = path->level; + int fail_idx = -1; @@ -17716,10 +18599,8 @@ index 000000000000..f43044e6fa37 + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) +{ -+ struct btree_path *linked, *deadlock_path = NULL; -+ u64 start_time = local_clock(); -+ unsigned reason = 9; -+ bool ret; ++ struct btree_path *linked; ++ unsigned reason; + + /* Check if it's safe to block: */ + trans_for_each_path(trans, linked) { @@ -17740,28 +18621,28 @@ index 000000000000..f43044e6fa37 + */ + if (type == SIX_LOCK_intent && + linked->nodes_locked != linked->nodes_intent_locked) { -+ deadlock_path = linked; + reason = 1; ++ goto deadlock; + } + + if (linked->btree_id != path->btree_id) { -+ if (linked->btree_id > path->btree_id) { -+ deadlock_path = linked; -+ reason = 3; -+ } -+ continue; ++ if (linked->btree_id < path->btree_id) ++ continue; ++ ++ reason = 3; ++ goto deadlock; + } + + /* -+ * Within the same btree, cached paths come before non -+ * cached paths: ++ * Within the same btree, non-cached paths come before cached ++ * paths: + */ + if (linked->cached != path->cached) { -+ if (path->cached) { -+ deadlock_path = linked; -+ reason = 4; -+ } -+ continue; ++ if (!linked->cached) ++ continue; ++ ++ reason = 4; ++ goto deadlock; + } + + /* @@ -17770,53 +18651,33 @@ index 000000000000..f43044e6fa37 + * we're about to lock, it must have the ancestors locked too: + */ + if (level > __fls(linked->nodes_locked)) { -+ deadlock_path = linked; + reason = 5; ++ goto deadlock; + } + + /* Must lock btree 
nodes in key order: */ + if (btree_node_locked(linked, level) && + bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b, + linked->cached)) <= 0) { -+ deadlock_path = linked; -+ reason = 7; + BUG_ON(trans->in_traverse_all); ++ reason = 7; ++ goto deadlock; + } + } + -+ if (unlikely(deadlock_path)) { -+ trace_trans_restart_would_deadlock(trans->ip, ip, -+ trans->in_traverse_all, reason, -+ deadlock_path->btree_id, -+ deadlock_path->cached, -+ &deadlock_path->pos, -+ path->btree_id, -+ path->cached, -+ &pos); -+ btree_trans_restart(trans); -+ return false; -+ } -+ -+ if (six_trylock_type(&b->c.lock, type)) -+ return true; -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ trans->locking_path_idx = path->idx; -+ trans->locking_pos = pos; -+ trans->locking_btree_id = path->btree_id; -+ trans->locking_level = level; -+ trans->locking = b; -+#endif -+ -+ ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p) == 0; -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ trans->locking = NULL; -+#endif -+ if (ret) -+ bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)], -+ start_time); -+ return ret; ++ return btree_node_lock_type(trans, path, b, pos, level, ++ type, should_sleep_fn, p); ++deadlock: ++ trace_trans_restart_would_deadlock(trans->fn, ip, ++ trans->in_traverse_all, reason, ++ linked->btree_id, ++ linked->cached, ++ &linked->pos, ++ path->btree_id, ++ path->cached, ++ &pos); ++ btree_trans_restart(trans); ++ return false; +} + +/* Btree iterator locking: */ @@ -17865,6 +18726,8 @@ index 000000000000..f43044e6fa37 + if (!bch2_btree_node_relock(trans, path, l)) { + __bch2_btree_path_unlock(path); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); ++ trace_trans_restart_relock_path_intent(trans->fn, _RET_IP_, ++ path->btree_id, &path->pos); + btree_trans_restart(trans); + return false; + } @@ -17877,10 +18740,13 @@ index 000000000000..f43044e6fa37 +static bool bch2_btree_path_relock(struct btree_trans *trans, + struct btree_path *path, unsigned long trace_ip) +{ -+ bool 
ret = btree_path_get_locks(trans, path, false, trace_ip); ++ bool ret = btree_path_get_locks(trans, path, false); + -+ if (!ret) ++ if (!ret) { ++ trace_trans_restart_relock_path(trans->fn, trace_ip, ++ path->btree_id, &path->pos); + btree_trans_restart(trans); ++ } + return ret; +} + @@ -17894,7 +18760,7 @@ index 000000000000..f43044e6fa37 + + path->locks_want = new_locks_want; + -+ if (btree_path_get_locks(trans, path, true, _THIS_IP_)) ++ if (btree_path_get_locks(trans, path, true)) + return true; + + /* @@ -17916,14 +18782,15 @@ index 000000000000..f43044e6fa37 + * before interior nodes - now that's handled by + * bch2_btree_path_traverse_all(). + */ -+ trans_for_each_path(trans, linked) -+ if (linked != path && -+ linked->cached == path->cached && -+ linked->btree_id == path->btree_id && -+ linked->locks_want < new_locks_want) { -+ linked->locks_want = new_locks_want; -+ btree_path_get_locks(trans, linked, true, _THIS_IP_); -+ } ++ if (!path->cached && !trans->in_traverse_all) ++ trans_for_each_path(trans, linked) ++ if (linked != path && ++ linked->cached == path->cached && ++ linked->btree_id == path->btree_id && ++ linked->locks_want < new_locks_want) { ++ linked->locks_want = new_locks_want; ++ btree_path_get_locks(trans, linked, true); ++ } + + return false; +} @@ -17973,7 +18840,7 @@ index 000000000000..f43044e6fa37 + trans_for_each_path(trans, path) + if (path->should_be_locked && + !bch2_btree_path_relock(trans, path, _RET_IP_)) { -+ trace_trans_restart_relock(trans->ip, _RET_IP_, ++ trace_trans_restart_relock(trans->fn, _RET_IP_, + path->btree_id, &path->pos); + BUG_ON(!trans->restarted); + return false; @@ -17988,7 +18855,12 @@ index 000000000000..f43044e6fa37 + trans_for_each_path(trans, path) + __bch2_btree_path_unlock(path); + -+ BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); ++ /* ++ * bch2_gc_btree_init_recurse() doesn't use btree iterators for walking ++ * btree nodes, it implements its own walking: ++ */ ++ BUG_ON(!trans->is_initial_gc 
&& ++ lock_class_is_held(&bch2_btree_node_lock_key)); +} + +/* Btree iterator: */ @@ -18019,7 +18891,9 @@ index 000000000000..f43044e6fa37 + struct btree_node_iter tmp; + bool locked; + struct bkey_packed *p, *k; -+ char buf1[100], buf2[100], buf3[100]; ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; ++ struct printbuf buf3 = PRINTBUF; + const char *msg; + + if (!bch2_debug_check_iterators) @@ -18067,26 +18941,27 @@ index 000000000000..f43044e6fa37 + btree_node_unlock(path, level); + return; +err: -+ strcpy(buf2, "(none)"); -+ strcpy(buf3, "(none)"); -+ -+ bch2_bpos_to_text(&PBUF(buf1), path->pos); ++ bch2_bpos_to_text(&buf1, path->pos); + + if (p) { + struct bkey uk = bkey_unpack_key(l->b, p); -+ bch2_bkey_to_text(&PBUF(buf2), &uk); ++ bch2_bkey_to_text(&buf2, &uk); ++ } else { ++ pr_buf(&buf2, "(none)"); + } + + if (k) { + struct bkey uk = bkey_unpack_key(l->b, k); -+ bch2_bkey_to_text(&PBUF(buf3), &uk); ++ bch2_bkey_to_text(&buf3, &uk); ++ } else { ++ pr_buf(&buf3, "(none)"); + } + + panic("path should be %s key at level %u:\n" + "path pos %s\n" + "prev key %s\n" + "cur key %s\n", -+ msg, level, buf1, buf2, buf3); ++ msg, level, buf1.buf, buf2.buf, buf3.buf); +} + +static void bch2_btree_path_verify(struct btree_trans *trans, @@ -18126,9 +19001,6 @@ index 000000000000..f43044e6fa37 + + BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != iter->path->cached); + -+ BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && -+ iter->pos.snapshot != iter->snapshot); -+ + BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) && + (iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); + @@ -18136,6 +19008,8 @@ index 000000000000..f43044e6fa37 + (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && + !btree_type_has_snapshots(iter->btree_id)); + ++ if (iter->update_path) ++ bch2_btree_path_verify(trans, iter->update_path); + bch2_btree_path_verify(trans, iter->path); +} + @@ -18172,6 +19046,7 @@ index 000000000000..f43044e6fa37 + k.k->p.snapshot)); + + bch2_trans_iter_init(trans, ©, 
iter->btree_id, iter->pos, ++ BTREE_ITER_NOPRESERVE| + BTREE_ITER_ALL_SNAPSHOTS); + prev = bch2_btree_iter_prev(©); + if (!prev.k) @@ -18184,16 +19059,16 @@ index 000000000000..f43044e6fa37 + if (!bkey_cmp(prev.k->p, k.k->p) && + bch2_snapshot_is_ancestor(trans->c, iter->snapshot, + prev.k->p.snapshot) > 0) { -+ char buf1[100], buf2[200]; ++ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + -+ bch2_bkey_to_text(&PBUF(buf1), k.k); -+ bch2_bkey_to_text(&PBUF(buf2), prev.k); ++ bch2_bkey_to_text(&buf1, k.k); ++ bch2_bkey_to_text(&buf2, prev.k); + + panic("iter snap %u\n" + "k %s\n" + "prev %s\n", + iter->snapshot, -+ buf1, buf2); ++ buf1.buf, buf2.buf); + } +out: + bch2_trans_iter_exit(trans, ©); @@ -18205,7 +19080,7 @@ index 000000000000..f43044e6fa37 +{ + struct btree_path *path; + unsigned idx; -+ char buf[100]; ++ struct printbuf buf = PRINTBUF; + + trans_for_each_path_inorder(trans, path, idx) { + int cmp = cmp_int(path->btree_id, id) ?: @@ -18231,9 +19106,10 @@ index 000000000000..f43044e6fa37 + } + + bch2_dump_trans_paths_updates(trans); ++ bch2_bpos_to_text(&buf, pos); ++ + panic("not locked: %s %s%s\n", -+ bch2_btree_ids[id], -+ (bch2_bpos_to_text(&PBUF(buf), pos), buf), ++ bch2_btree_ids[id], buf.buf, + key_cache ? " cached" : ""); +} + @@ -18419,8 +19295,6 @@ index 000000000000..f43044e6fa37 + struct bkey *u, + struct bkey_packed *k) +{ -+ struct bkey_s_c ret; -+ + if (unlikely(!k)) { + /* + * signal to bch2_btree_iter_peek_slot() that we're currently at @@ -18430,19 +19304,7 @@ index 000000000000..f43044e6fa37 + return bkey_s_c_null; + } + -+ ret = bkey_disassemble(l->b, k, u); -+ -+ /* -+ * XXX: bch2_btree_bset_insert_key() generates invalid keys when we -+ * overwrite extents - it sets k->type = KEY_TYPE_deleted on the key -+ * being overwritten but doesn't change k->size. 
But this is ok, because -+ * those keys are never written out, we just have to avoid a spurious -+ * assertion here: -+ */ -+ if (bch2_debug_check_bkeys && !bkey_deleted(ret.k)) -+ bch2_bkey_debugcheck(c, l->b, ret); -+ -+ return ret; ++ return bkey_disassemble(l->b, k, u); +} + +static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c, @@ -18502,6 +19364,7 @@ index 000000000000..f43044e6fa37 +static void btree_path_verify_new_node(struct btree_trans *trans, + struct btree_path *path, struct btree *b) +{ ++ struct bch_fs *c = trans->c; + struct btree_path_level *l; + unsigned plevel; + bool parent_locked; @@ -18510,6 +19373,9 @@ index 000000000000..f43044e6fa37 + if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) + return; + ++ if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) ++ return; ++ + plevel = b->c.level + 1; + if (!btree_path_node(path, plevel)) + return; @@ -18524,23 +19390,23 @@ index 000000000000..f43044e6fa37 + if (!k || + bkey_deleted(k) || + bkey_cmp_left_packed(l->b, k, &b->key.k.p)) { -+ char buf1[100]; -+ char buf2[100]; -+ char buf3[100]; -+ char buf4[100]; ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; ++ struct printbuf buf3 = PRINTBUF; ++ struct printbuf buf4 = PRINTBUF; + struct bkey uk = bkey_unpack_key(b, k); + -+ bch2_dump_btree_node(trans->c, l->b); -+ bch2_bpos_to_text(&PBUF(buf1), path->pos); -+ bch2_bkey_to_text(&PBUF(buf2), &uk); -+ bch2_bpos_to_text(&PBUF(buf3), b->data->min_key); -+ bch2_bpos_to_text(&PBUF(buf3), b->data->max_key); ++ bch2_dump_btree_node(c, l->b); ++ bch2_bpos_to_text(&buf1, path->pos); ++ bch2_bkey_to_text(&buf2, &uk); ++ bch2_bpos_to_text(&buf3, b->data->min_key); ++ bch2_bpos_to_text(&buf3, b->data->max_key); + panic("parent iter doesn't point to new node:\n" + "iter pos %s %s\n" + "iter key %s\n" + "new node %s-%s\n", -+ bch2_btree_ids[path->btree_id], buf1, -+ buf2, buf3, buf4); ++ bch2_btree_ids[path->btree_id], ++ buf1.buf, buf2.buf, buf3.buf, buf4.buf); + } + + if 
(!parent_locked) @@ -18598,7 +19464,7 @@ index 000000000000..f43044e6fa37 + t != BTREE_NODE_UNLOCKED) { + btree_node_unlock(path, b->c.level); + six_lock_increment(&b->c.lock, t); -+ mark_btree_node_locked(path, b->c.level, t); ++ mark_btree_node_locked(trans, path, b->c.level, t); + } + + btree_path_level_init(trans, path, b); @@ -18675,7 +19541,7 @@ index 000000000000..f43044e6fa37 + for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++) + path->l[i].b = NULL; + -+ mark_btree_node_locked(path, path->level, lock_type); ++ mark_btree_node_locked(trans, path, path->level, lock_type); + btree_path_level_init(trans, path, b); + return 0; + } @@ -18721,6 +19587,41 @@ index 000000000000..f43044e6fa37 + return ret; +} + ++static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path, ++ struct btree_and_journal_iter *jiter) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_s_c k; ++ struct bkey_buf tmp; ++ unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) ++ ? (path->level > 1 ? 0 : 2) ++ : (path->level > 1 ? 
1 : 16); ++ bool was_locked = btree_node_locked(path, path->level); ++ int ret = 0; ++ ++ bch2_bkey_buf_init(&tmp); ++ ++ while (nr && !ret) { ++ if (!bch2_btree_node_relock(trans, path, path->level)) ++ break; ++ ++ bch2_btree_and_journal_iter_advance(jiter); ++ k = bch2_btree_and_journal_iter_peek(jiter); ++ if (!k.k) ++ break; ++ ++ bch2_bkey_buf_reassemble(&tmp, c, k); ++ ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id, ++ path->level - 1); ++ } ++ ++ if (!was_locked) ++ btree_node_unlock(path, path->level); ++ ++ bch2_bkey_buf_exit(&tmp, c); ++ return ret; ++} ++ +static noinline void btree_node_mem_ptr_set(struct btree_trans *trans, + struct btree_path *path, + unsigned plevel, struct btree *b) @@ -18743,6 +19644,30 @@ index 000000000000..f43044e6fa37 + btree_node_unlock(path, plevel); +} + ++static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, ++ struct btree_path *path, ++ unsigned flags, ++ struct bkey_buf *out) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_path_level *l = path_l(path); ++ struct btree_and_journal_iter jiter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ __bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos); ++ ++ k = bch2_btree_and_journal_iter_peek(&jiter); ++ ++ bch2_bkey_buf_reassemble(out, c, k); ++ ++ if (flags & BTREE_ITER_PREFETCH) ++ ret = btree_path_prefetch_j(trans, path, &jiter); ++ ++ bch2_btree_and_journal_iter_exit(&jiter); ++ return ret; ++} ++ +static __always_inline int btree_path_down(struct btree_trans *trans, + struct btree_path *path, + unsigned flags, @@ -18753,30 +19678,41 @@ index 000000000000..f43044e6fa37 + struct btree *b; + unsigned level = path->level - 1; + enum six_lock_type lock_type = __btree_lock_want(path, level); ++ bool replay_done = test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags); + struct bkey_buf tmp; + int ret; + + EBUG_ON(!btree_node_locked(path, path->level)); + + bch2_bkey_buf_init(&tmp); -+ 
bch2_bkey_buf_unpack(&tmp, c, l->b, -+ bch2_btree_node_iter_peek(&l->iter, l->b)); ++ ++ if (unlikely(!replay_done)) { ++ ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp); ++ if (ret) ++ goto err; ++ } else { ++ bch2_bkey_buf_unpack(&tmp, c, l->b, ++ bch2_btree_node_iter_peek(&l->iter, l->b)); ++ ++ if (flags & BTREE_ITER_PREFETCH) { ++ ret = btree_path_prefetch(trans, path); ++ if (ret) ++ goto err; ++ } ++ } + + b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip); + ret = PTR_ERR_OR_ZERO(b); + if (unlikely(ret)) + goto err; + -+ mark_btree_node_locked(path, level, lock_type); ++ mark_btree_node_locked(trans, path, level, lock_type); + btree_path_level_init(trans, path, b); + -+ if (tmp.k->k.type == KEY_TYPE_btree_ptr_v2 && ++ if (likely(replay_done && tmp.k->k.type == KEY_TYPE_btree_ptr_v2) && + unlikely(b != btree_node_mem_ptr(tmp.k))) + btree_node_mem_ptr_set(trans, path, level + 1, b); + -+ if (flags & BTREE_ITER_PREFETCH) -+ ret = btree_path_prefetch(trans, path); -+ + if (btree_node_read_locked(path, level + 1)) + btree_node_unlock(path, level + 1); + path->level = level; @@ -18790,12 +19726,12 @@ index 000000000000..f43044e6fa37 +static int btree_path_traverse_one(struct btree_trans *, struct btree_path *, + unsigned, unsigned long); + -+static int __btree_path_traverse_all(struct btree_trans *trans, int ret, -+ unsigned long trace_ip) ++static int bch2_btree_path_traverse_all(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct btree_path *path; -+ int i; ++ unsigned long trace_ip = _RET_IP_; ++ int i, ret = 0; + + if (trans->in_traverse_all) + return -EINTR; @@ -18803,6 +19739,7 @@ index 000000000000..f43044e6fa37 + trans->in_traverse_all = true; +retry_all: + trans->restarted = false; ++ trans->traverse_all_idx = U8_MAX; + + trans_for_each_path(trans, path) + path->should_be_locked = false; @@ -18823,7 +19760,7 @@ index 000000000000..f43044e6fa37 + bch2_trans_unlock(trans); + cond_resched(); + -+ if 
(unlikely(ret == -ENOMEM)) { ++ if (unlikely(trans->memory_allocation_failure)) { + struct closure cl; + + closure_init_stack(&cl); @@ -18834,27 +19771,25 @@ index 000000000000..f43044e6fa37 + } while (ret); + } + -+ if (unlikely(ret == -EIO)) -+ goto out; -+ -+ BUG_ON(ret && ret != -EINTR); -+ + /* Now, redo traversals in correct order: */ -+ i = 0; -+ while (i < trans->nr_sorted) { -+ path = trans->paths + trans->sorted[i]; ++ trans->traverse_all_idx = 0; ++ while (trans->traverse_all_idx < trans->nr_sorted) { ++ path = trans->paths + trans->sorted[trans->traverse_all_idx]; + -+ EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); -+ -+ ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_); -+ if (ret) -+ goto retry_all; -+ -+ EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); -+ -+ if (path->nodes_locked || -+ !btree_path_node(path, path->level)) -+ i++; ++ /* ++ * Traversing a path can cause another path to be added at about ++ * the same position: ++ */ ++ if (path->uptodate) { ++ ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_); ++ if (ret == -EINTR || ret == -ENOMEM) ++ goto retry_all; ++ if (ret) ++ goto err; ++ BUG_ON(path->uptodate); ++ } else { ++ trans->traverse_all_idx++; ++ } + } + + /* @@ -18864,20 +19799,15 @@ index 000000000000..f43044e6fa37 + */ + trans_for_each_path(trans, path) + BUG_ON(path->uptodate >= BTREE_ITER_NEED_TRAVERSE); -+out: ++err: + bch2_btree_cache_cannibalize_unlock(c); + + trans->in_traverse_all = false; + -+ trace_trans_traverse_all(trans->ip, trace_ip); ++ trace_trans_traverse_all(trans->fn, trace_ip); + return ret; +} + -+static int bch2_btree_path_traverse_all(struct btree_trans *trans) -+{ -+ return __btree_path_traverse_all(trans, 0, _RET_IP_); -+} -+ +static inline bool btree_path_good_node(struct btree_trans *trans, + struct btree_path *path, + unsigned l, int check_pos) @@ -19001,8 +19931,6 @@ index 000000000000..f43044e6fa37 + return ret; +} + -+static int __btree_path_traverse_all(struct 
btree_trans *, int, unsigned long); -+ +int __must_check bch2_btree_path_traverse(struct btree_trans *trans, + struct btree_path *path, unsigned flags) +{ @@ -19026,7 +19954,7 @@ index 000000000000..f43044e6fa37 + six_lock_increment(&dst->l[i].b->c.lock, + __btree_lock_want(dst, i)); + -+ btree_path_check_sort(trans, dst, 0); ++ bch2_btree_path_check_sort(trans, dst, 0); +} + +static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src, @@ -19041,25 +19969,27 @@ index 000000000000..f43044e6fa37 + +inline struct btree_path * __must_check +bch2_btree_path_make_mut(struct btree_trans *trans, -+ struct btree_path *path, bool intent) ++ struct btree_path *path, bool intent, ++ unsigned long ip) +{ + if (path->ref > 1 || path->preserve) { + __btree_path_put(path, intent); + path = btree_path_clone(trans, path, intent); + path->preserve = false; +#ifdef CONFIG_BCACHEFS_DEBUG -+ path->ip_allocated = _RET_IP_; ++ path->ip_allocated = ip; +#endif + btree_trans_verify_sorted(trans); + } + ++ path->should_be_locked = false; + return path; +} + -+static struct btree_path * __must_check -+btree_path_set_pos(struct btree_trans *trans, ++struct btree_path * __must_check ++bch2_btree_path_set_pos(struct btree_trans *trans, + struct btree_path *path, struct bpos new_pos, -+ bool intent) ++ bool intent, unsigned long ip) +{ + int cmp = bpos_cmp(new_pos, path->pos); + unsigned l = path->level; @@ -19070,12 +20000,11 @@ index 000000000000..f43044e6fa37 + if (!cmp) + return path; + -+ path = bch2_btree_path_make_mut(trans, path, intent); ++ path = bch2_btree_path_make_mut(trans, path, intent, ip); + -+ path->pos = new_pos; -+ path->should_be_locked = false; ++ path->pos = new_pos; + -+ btree_path_check_sort(trans, path, cmp); ++ bch2_btree_path_check_sort(trans, path, cmp); + + if (unlikely(path->cached)) { + btree_node_unlock(path, 0); @@ -19087,6 +20016,7 @@ index 000000000000..f43044e6fa37 + l = btree_path_up_until_good_node(trans, path, cmp); + + if 
(btree_path_node(path, l)) { ++ BUG_ON(!btree_node_locked(path, l)); + /* + * We might have to skip over many keys, or just a few: try + * advancing the node iterator, and if we have to skip over too @@ -19179,23 +20109,64 @@ index 000000000000..f43044e6fa37 + __bch2_path_free(trans, path); +} + ++void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) ++{ ++ struct btree_insert_entry *i; ++ ++ pr_buf(buf, "transaction updates for %s journal seq %llu", ++ trans->fn, trans->journal_res.seq); ++ pr_newline(buf); ++ pr_indent_push(buf, 2); ++ ++ trans_for_each_update(trans, i) { ++ struct bkey_s_c old = { &i->old_k, i->old_v }; ++ ++ pr_buf(buf, "update: btree %s %pS", ++ bch2_btree_ids[i->btree_id], ++ (void *) i->ip_allocated); ++ pr_newline(buf); ++ ++ pr_buf(buf, " old "); ++ bch2_bkey_val_to_text(buf, trans->c, old); ++ pr_newline(buf); ++ ++ pr_buf(buf, " new "); ++ bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k)); ++ pr_newline(buf); ++ } ++ ++ pr_indent_pop(buf, 2); ++} ++ ++noinline __cold ++void bch2_dump_trans_updates(struct btree_trans *trans) ++{ ++ struct printbuf buf = PRINTBUF; ++ ++ bch2_trans_updates_to_text(&buf, trans); ++ bch_err(trans->c, "%s", buf.buf); ++ printbuf_exit(&buf); ++} ++ +noinline __cold +void bch2_dump_trans_paths_updates(struct btree_trans *trans) +{ + struct btree_path *path; -+ struct btree_insert_entry *i; ++ struct printbuf buf = PRINTBUF; + unsigned idx; -+ char buf1[300], buf2[300]; + -+ btree_trans_verify_sorted(trans); ++ trans_for_each_path_inorder(trans, path, idx) { ++ printbuf_reset(&buf); + -+ trans_for_each_path_inorder(trans, path, idx) -+ printk(KERN_ERR "path: idx %u ref %u:%u%s%s btree %s pos %s locks %u %pS\n", ++ bch2_bpos_to_text(&buf, path->pos); ++ ++ printk(KERN_ERR "path: idx %u ref %u:%u%s%s btree=%s l=%u pos %s locks %u %pS\n", + path->idx, path->ref, path->intent_ref, + path->should_be_locked ? " S" : "", + path->preserve ? 
" P" : "", + bch2_btree_ids[path->btree_id], -+ (bch2_bpos_to_text(&PBUF(buf1), path->pos), buf1), ++ path->level, ++ buf.buf, + path->nodes_locked, +#ifdef CONFIG_BCACHEFS_DEBUG + (void *) path->ip_allocated @@ -19203,17 +20174,11 @@ index 000000000000..f43044e6fa37 + NULL +#endif + ); -+ -+ trans_for_each_update(trans, i) { -+ struct bkey u; -+ struct bkey_s_c old = bch2_btree_path_peek_slot(i->path, &u); -+ -+ printk(KERN_ERR "update: btree %s %pS\n old %s\n new %s", -+ bch2_btree_ids[i->btree_id], -+ (void *) i->ip_allocated, -+ (bch2_bkey_val_to_text(&PBUF(buf1), trans->c, old), buf1), -+ (bch2_bkey_val_to_text(&PBUF(buf2), trans->c, bkey_i_to_s_c(i->k)), buf2)); + } ++ ++ printbuf_exit(&buf); ++ ++ bch2_dump_trans_updates(trans); +} + +static struct btree_path *btree_path_alloc(struct btree_trans *trans, @@ -19243,15 +20208,19 @@ index 000000000000..f43044e6fa37 + return path; +} + -+struct btree_path *bch2_path_get(struct btree_trans *trans, bool cached, ++struct btree_path *bch2_path_get(struct btree_trans *trans, + enum btree_id btree_id, struct bpos pos, + unsigned locks_want, unsigned level, -+ bool intent) ++ unsigned flags, unsigned long ip) +{ + struct btree_path *path, *path_pos = NULL; ++ bool cached = flags & BTREE_ITER_CACHED; ++ bool intent = flags & BTREE_ITER_INTENT; + int i; + + BUG_ON(trans->restarted); ++ btree_trans_verify_sorted(trans); ++ bch2_trans_verify_locks(trans); + + trans_for_each_path_inorder(trans, path, i) { + if (__btree_path_cmp(path, @@ -19269,8 +20238,7 @@ index 000000000000..f43044e6fa37 + path_pos->btree_id == btree_id && + path_pos->level == level) { + __btree_path_get(path_pos, intent); -+ path = btree_path_set_pos(trans, path_pos, pos, intent); -+ path->preserve = true; ++ path = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip); + } else { + path = btree_path_alloc(trans, path_pos); + path_pos = NULL; @@ -19279,7 +20247,6 @@ index 000000000000..f43044e6fa37 + path->pos = pos; + path->btree_id = btree_id; + 
path->cached = cached; -+ path->preserve = true; + path->uptodate = BTREE_ITER_NEED_TRAVERSE; + path->should_be_locked = false; + path->level = level; @@ -19289,11 +20256,14 @@ index 000000000000..f43044e6fa37 + for (i = 0; i < ARRAY_SIZE(path->l); i++) + path->l[i].b = BTREE_ITER_NO_NODE_INIT; +#ifdef CONFIG_BCACHEFS_DEBUG -+ path->ip_allocated = _RET_IP_; ++ path->ip_allocated = ip; +#endif + btree_trans_verify_sorted(trans); + } + ++ if (!(flags & BTREE_ITER_NOPRESERVE)) ++ path->preserve = true; ++ + if (path->intent_ref) + locks_want = max(locks_want, level + 1); + @@ -19308,7 +20278,7 @@ index 000000000000..f43044e6fa37 + locks_want = min(locks_want, BTREE_MAX_DEPTH); + if (locks_want > path->locks_want) { + path->locks_want = locks_want; -+ btree_path_get_locks(trans, path, true, _THIS_IP_); ++ btree_path_get_locks(trans, path, true); + } + + return path; @@ -19319,13 +20289,13 @@ index 000000000000..f43044e6fa37 + + struct bkey_s_c k; + -+ BUG_ON(path->uptodate != BTREE_ITER_UPTODATE); -+ + if (!path->cached) { + struct btree_path_level *l = path_l(path); -+ struct bkey_packed *_k = -+ bch2_btree_node_iter_peek_all(&l->iter, l->b); ++ struct bkey_packed *_k; + ++ EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE); ++ ++ _k = bch2_btree_node_iter_peek_all(&l->iter, l->b); + k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null; + + EBUG_ON(k.k && bkey_deleted(k.k) && bpos_cmp(k.k->p, path->pos) == 0); @@ -19335,13 +20305,17 @@ index 000000000000..f43044e6fa37 + } else { + struct bkey_cached *ck = (void *) path->l[0].b; + -+ EBUG_ON(path->btree_id != ck->key.btree_id || -+ bkey_cmp(path->pos, ck->key.pos)); ++ EBUG_ON(ck && ++ (path->btree_id != ck->key.btree_id || ++ bkey_cmp(path->pos, ck->key.pos))); + -+ /* BTREE_ITER_CACHED_NOFILL? */ -+ if (unlikely(!ck->valid)) -+ goto hole; ++ /* BTREE_ITER_CACHED_NOFILL|BTREE_ITER_CACHED_NOCREATE? 
*/ ++ if (unlikely(!ck || !ck->valid)) ++ return bkey_s_c_null; + ++ EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE); ++ ++ *u = ck->k->k; + k = bkey_i_to_s_c(ck->k); + } + @@ -19365,9 +20339,10 @@ index 000000000000..f43044e6fa37 +{ + int ret; + -+ iter->path = btree_path_set_pos(iter->trans, iter->path, ++ iter->path = bch2_btree_path_set_pos(iter->trans, iter->path, + btree_iter_search_key(iter), -+ iter->flags & BTREE_ITER_INTENT); ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); + + ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); + if (ret) @@ -19401,8 +20376,9 @@ index 000000000000..f43044e6fa37 + bkey_init(&iter->k); + iter->k.p = iter->pos = b->key.k.p; + -+ iter->path = btree_path_set_pos(trans, iter->path, b->key.k.p, -+ iter->flags & BTREE_ITER_INTENT); ++ iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); + iter->path->should_be_locked = true; + BUG_ON(iter->path->uptodate); +out: @@ -19436,6 +20412,7 @@ index 000000000000..f43044e6fa37 + btree_node_unlock(path, path->level); + path->l[path->level].b = BTREE_ITER_NO_NODE_UP; + path->level++; ++ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + return NULL; + } + @@ -19443,6 +20420,9 @@ index 000000000000..f43044e6fa37 + __bch2_btree_path_unlock(path); + path->l[path->level].b = BTREE_ITER_NO_NODE_GET_LOCKS; + path->l[path->level + 1].b = BTREE_ITER_NO_NODE_GET_LOCKS; ++ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); ++ trace_trans_restart_relock_next_node(trans->fn, _THIS_IP_, ++ path->btree_id, &path->pos); + btree_trans_restart(trans); + ret = -EINTR; + goto err; @@ -19460,8 +20440,9 @@ index 000000000000..f43044e6fa37 + * the next child node + */ + path = iter->path = -+ btree_path_set_pos(trans, path, bpos_successor(iter->pos), -+ iter->flags & BTREE_ITER_INTENT); ++ bch2_btree_path_set_pos(trans, path, bpos_successor(iter->pos), ++ iter->flags & 
BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); + + path->level = iter->min_depth; + @@ -19482,8 +20463,9 @@ index 000000000000..f43044e6fa37 + bkey_init(&iter->k); + iter->k.p = iter->pos = b->key.k.p; + -+ iter->path = btree_path_set_pos(trans, iter->path, b->key.k.p, -+ iter->flags & BTREE_ITER_INTENT); ++ iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); + iter->path->should_be_locked = true; + BUG_ON(iter->path->uptodate); +out: @@ -19524,25 +20506,90 @@ index 000000000000..f43044e6fa37 + return ret; +} + -+/** -+ * bch2_btree_iter_peek: returns first key greater than or equal to iterator's -+ * current position ++static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans, ++ enum btree_id btree_id, ++ struct bpos pos) ++{ ++ struct btree_insert_entry *i; ++ ++ trans_for_each_update(trans, i) ++ if ((cmp_int(btree_id, i->btree_id) ?: ++ bpos_cmp(pos, i->k->k.p)) <= 0) { ++ if (btree_id == i->btree_id) ++ return i->k; ++ break; ++ } ++ ++ return NULL; ++} ++ ++static noinline ++struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k) ++{ ++ struct bkey_i *next_journal = ++ bch2_journal_keys_peek(trans->c, iter->btree_id, 0, ++ iter->path->pos); ++ ++ if (next_journal && ++ bpos_cmp(next_journal->k.p, ++ k.k ? 
k.k->p : iter->path->l[0].b->key.k.p) <= 0) { ++ iter->k = next_journal->k; ++ k = bkey_i_to_s_c(next_journal); ++ } ++ ++ return k; ++} ++ ++/* ++ * Checks btree key cache for key at iter->pos and returns it if present, or ++ * bkey_s_c_null: + */ -+struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) ++static noinline ++struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct bch_fs *c = trans->c; ++ struct bkey u; ++ int ret; ++ ++ if (!bch2_btree_key_cache_find(c, iter->btree_id, pos)) ++ return bkey_s_c_null; ++ ++ if (!iter->key_cache_path) ++ iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos, ++ iter->flags & BTREE_ITER_INTENT, 0, ++ iter->flags|BTREE_ITER_CACHED, ++ _THIS_IP_); ++ ++ iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos, ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); ++ ++ ret = bch2_btree_path_traverse(trans, iter->key_cache_path, iter->flags|BTREE_ITER_CACHED); ++ if (unlikely(ret)) ++ return bkey_s_c_err(ret); ++ ++ iter->key_cache_path->should_be_locked = true; ++ ++ return bch2_btree_path_peek_slot(iter->key_cache_path, &u); ++} ++ ++static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key) +{ + struct btree_trans *trans = iter->trans; -+ struct bpos search_key = btree_iter_search_key(iter); + struct bkey_i *next_update; -+ struct bkey_s_c k; -+ int ret, cmp; ++ struct bkey_s_c k, k2; ++ int ret; + + EBUG_ON(iter->path->cached || iter->path->level); + bch2_btree_iter_verify(iter); -+ bch2_btree_iter_verify_entry_exit(iter); + + while (1) { -+ iter->path = btree_path_set_pos(trans, iter->path, search_key, -+ iter->flags & BTREE_ITER_INTENT); ++ iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); + + ret = bch2_btree_path_traverse(trans, iter->path, 
iter->flags); + if (unlikely(ret)) { @@ -19552,19 +20599,30 @@ index 000000000000..f43044e6fa37 + goto out; + } + ++ iter->path->should_be_locked = true; ++ ++ k = btree_path_level_peek_all(trans->c, &iter->path->l[0], &iter->k); ++ ++ if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && ++ k.k && ++ (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { ++ ret = bkey_err(k2); ++ if (ret) { ++ k = k2; ++ bch2_btree_iter_set_pos(iter, iter->pos); ++ goto out; ++ } ++ ++ k = k2; ++ iter->k = *k.k; ++ } ++ ++ if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL)) ++ k = btree_trans_peek_journal(trans, iter, k); ++ + next_update = iter->flags & BTREE_ITER_WITH_UPDATES + ? btree_trans_peek_updates(trans, iter->btree_id, search_key) + : NULL; -+ k = btree_path_level_peek_all(trans->c, &iter->path->l[0], &iter->k); -+ -+ /* * In the btree, deleted keys sort before non deleted: */ -+ if (k.k && bkey_deleted(k.k) && -+ (!next_update || -+ bpos_cmp(k.k->p, next_update->k.p) <= 0)) { -+ search_key = k.k->p; -+ continue; -+ } -+ + if (next_update && + bpos_cmp(next_update->k.p, + k.k ? k.k->p : iter->path->l[0].b->key.k.p) <= 0) { @@ -19572,25 +20630,21 @@ index 000000000000..f43044e6fa37 + k = bkey_i_to_s_c(next_update); + } + -+ if (likely(k.k)) { ++ if (k.k && bkey_deleted(k.k)) { + /* -+ * We can never have a key in a leaf node at POS_MAX, so -+ * we don't have to check these successor() calls: ++ * If we've got a whiteout, and it's after the search ++ * key, advance the search key to the whiteout instead ++ * of just after the whiteout - it might be a btree ++ * whiteout, with a real key at the same position, since ++ * in the btree deleted keys sort before non deleted. 
+ */ -+ if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && -+ !bch2_snapshot_is_ancestor(trans->c, -+ iter->snapshot, -+ k.k->p.snapshot)) { -+ search_key = bpos_successor(k.k->p); -+ continue; -+ } -+ -+ if (bkey_whiteout(k.k) && -+ !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { -+ search_key = bkey_successor(iter, k.k->p); -+ continue; -+ } ++ search_key = bpos_cmp(search_key, k.k->p) ++ ? k.k->p ++ : bpos_successor(k.k->p); ++ continue; ++ } + ++ if (likely(k.k)) { + break; + } else if (likely(bpos_cmp(iter->path->l[0].b->key.k.p, SPOS_MAX))) { + /* Advance to next leaf node: */ @@ -19602,34 +20656,137 @@ index 000000000000..f43044e6fa37 + goto out; + } + } -+ -+ /* -+ * iter->pos should be mononotically increasing, and always be equal to -+ * the key we just returned - except extents can straddle iter->pos: -+ */ -+ if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) -+ iter->pos = k.k->p; -+ else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) -+ iter->pos = bkey_start_pos(k.k); -+ -+ if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) -+ iter->pos.snapshot = iter->snapshot; -+ -+ cmp = bpos_cmp(k.k->p, iter->path->pos); -+ if (cmp) { -+ iter->path = bch2_btree_path_make_mut(trans, iter->path, -+ iter->flags & BTREE_ITER_INTENT); -+ iter->path->pos = k.k->p; -+ btree_path_check_sort(trans, iter->path, cmp); -+ } +out: -+ iter->path->should_be_locked = true; ++ bch2_btree_iter_verify(iter); ++ ++ return k; ++} ++ ++/** ++ * bch2_btree_iter_peek: returns first key greater than or equal to iterator's ++ * current position ++ */ ++struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct bpos search_key = btree_iter_search_key(iter); ++ struct bkey_s_c k; ++ struct bpos iter_pos; ++ int ret; ++ ++ if (iter->update_path) { ++ bch2_path_put(trans, iter->update_path, ++ iter->flags & BTREE_ITER_INTENT); ++ iter->update_path = NULL; ++ } + + bch2_btree_iter_verify_entry_exit(iter); -+ 
bch2_btree_iter_verify(iter); ++ ++ while (1) { ++ k = __bch2_btree_iter_peek(iter, search_key); ++ if (!k.k || bkey_err(k)) ++ goto out; ++ ++ /* ++ * iter->pos should be mononotically increasing, and always be ++ * equal to the key we just returned - except extents can ++ * straddle iter->pos: ++ */ ++ if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) ++ iter_pos = k.k->p; ++ else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) ++ iter_pos = bkey_start_pos(k.k); ++ else ++ iter_pos = iter->pos; ++ ++ if (bkey_cmp(iter_pos, end) > 0) { ++ bch2_btree_iter_set_pos(iter, end); ++ k = bkey_s_c_null; ++ goto out; ++ } ++ ++ if (iter->update_path && ++ bkey_cmp(iter->update_path->pos, k.k->p)) { ++ bch2_path_put(trans, iter->update_path, ++ iter->flags & BTREE_ITER_INTENT); ++ iter->update_path = NULL; ++ } ++ ++ if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && ++ (iter->flags & BTREE_ITER_INTENT) && ++ !(iter->flags & BTREE_ITER_IS_EXTENTS) && ++ !iter->update_path) { ++ struct bpos pos = k.k->p; ++ ++ if (pos.snapshot < iter->snapshot) { ++ search_key = bpos_successor(k.k->p); ++ continue; ++ } ++ ++ pos.snapshot = iter->snapshot; ++ ++ /* ++ * advance, same as on exit for iter->path, but only up ++ * to snapshot ++ */ ++ __btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT); ++ iter->update_path = iter->path; ++ ++ iter->update_path = bch2_btree_path_set_pos(trans, ++ iter->update_path, pos, ++ iter->flags & BTREE_ITER_INTENT, ++ _THIS_IP_); ++ } ++ ++ /* ++ * We can never have a key in a leaf node at POS_MAX, so ++ * we don't have to check these successor() calls: ++ */ ++ if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && ++ !bch2_snapshot_is_ancestor(trans->c, ++ iter->snapshot, ++ k.k->p.snapshot)) { ++ search_key = bpos_successor(k.k->p); ++ continue; ++ } ++ ++ if (bkey_whiteout(k.k) && ++ !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { ++ search_key = bkey_successor(iter, k.k->p); ++ continue; ++ } ++ ++ break; ++ } ++ ++ iter->pos = iter_pos; ++ ++ 
iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p, ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); ++ BUG_ON(!iter->path->nodes_locked); ++out: ++ if (iter->update_path) { ++ if (iter->update_path->uptodate && ++ !bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_)) { ++ k = bkey_s_c_err(-EINTR); ++ } else { ++ BUG_ON(!(iter->update_path->nodes_locked & 1)); ++ iter->update_path->should_be_locked = true; ++ } ++ } ++ iter->path->should_be_locked = true; ++ ++ if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) ++ iter->pos.snapshot = iter->snapshot; ++ + ret = bch2_btree_iter_verify_ret(iter, k); -+ if (unlikely(ret)) -+ return bkey_s_c_err(ret); ++ if (unlikely(ret)) { ++ bch2_btree_iter_set_pos(iter, iter->pos); ++ k = bkey_s_c_err(ret); ++ } ++ ++ bch2_btree_iter_verify_entry_exit(iter); + + return k; +} @@ -19662,6 +20819,10 @@ index 000000000000..f43044e6fa37 + + EBUG_ON(iter->path->cached || iter->path->level); + EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES); ++ ++ if (iter->flags & BTREE_ITER_WITH_JOURNAL) ++ return bkey_s_c_err(-EIO); ++ + bch2_btree_iter_verify(iter); + bch2_btree_iter_verify_entry_exit(iter); + @@ -19669,8 +20830,9 @@ index 000000000000..f43044e6fa37 + search_key.snapshot = U32_MAX; + + while (1) { -+ iter->path = btree_path_set_pos(trans, iter->path, search_key, -+ iter->flags & BTREE_ITER_INTENT); ++ iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); + + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); + if (unlikely(ret)) { @@ -19689,7 +20851,7 @@ index 000000000000..f43044e6fa37 + k = btree_path_level_prev(trans->c, iter->path, + &iter->path->l[0], &iter->k); + -+ btree_path_check_sort(trans, iter->path, 0); ++ bch2_btree_path_check_sort(trans, iter->path, 0); + + if (likely(k.k)) { + if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) { @@ -19799,8 +20961,9 @@ index 000000000000..f43044e6fa37 + } + 
+ search_key = btree_iter_search_key(iter); -+ iter->path = btree_path_set_pos(trans, iter->path, search_key, -+ iter->flags & BTREE_ITER_INTENT); ++ iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); + + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); + if (unlikely(ret)) @@ -19810,25 +20973,44 @@ index 000000000000..f43044e6fa37 + !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) { + struct bkey_i *next_update; + -+ next_update = iter->flags & BTREE_ITER_WITH_UPDATES -+ ? btree_trans_peek_updates(trans, iter->btree_id, search_key) -+ : NULL; -+ -+ if (next_update && ++ if ((iter->flags & BTREE_ITER_WITH_UPDATES) && ++ (next_update = btree_trans_peek_updates(trans, ++ iter->btree_id, search_key)) && + !bpos_cmp(next_update->k.p, iter->pos)) { + iter->k = next_update->k; + k = bkey_i_to_s_c(next_update); -+ } else { -+ k = bch2_btree_path_peek_slot(iter->path, &iter->k); ++ goto out; + } ++ ++ if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) && ++ (next_update = bch2_journal_keys_peek(trans->c, iter->btree_id, ++ 0, iter->pos)) && ++ !bpos_cmp(next_update->k.p, iter->pos)) { ++ iter->k = next_update->k; ++ k = bkey_i_to_s_c(next_update); ++ goto out; ++ } ++ ++ if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && ++ (k = btree_trans_peek_key_cache(iter, iter->pos)).k) { ++ if (!bkey_err(k)) ++ iter->k = *k.k; ++ goto out; ++ } ++ ++ k = bch2_btree_path_peek_slot(iter->path, &iter->k); + } else { + struct bpos next; + + if (iter->flags & BTREE_ITER_INTENT) { + struct btree_iter iter2; ++ struct bpos end = iter->pos; ++ ++ if (iter->flags & BTREE_ITER_IS_EXTENTS) ++ end.offset = U64_MAX; + + bch2_trans_copy_iter(&iter2, iter); -+ k = bch2_btree_iter_peek(&iter2); ++ k = bch2_btree_iter_peek_upto(&iter2, end); + + if (k.k && !bkey_err(k)) { + iter->k = iter2.k; @@ -19850,18 +21032,21 @@ index 000000000000..f43044e6fa37 + if 
(bkey_cmp(iter->pos, next) < 0) { + bkey_init(&iter->k); + iter->k.p = iter->pos; -+ bch2_key_resize(&iter->k, -+ min_t(u64, KEY_SIZE_MAX, -+ (next.inode == iter->pos.inode -+ ? next.offset -+ : KEY_OFFSET_MAX) - -+ iter->pos.offset)); ++ ++ if (iter->flags & BTREE_ITER_IS_EXTENTS) { ++ bch2_key_resize(&iter->k, ++ min_t(u64, KEY_SIZE_MAX, ++ (next.inode == iter->pos.inode ++ ? next.offset ++ : KEY_OFFSET_MAX) - ++ iter->pos.offset)); ++ EBUG_ON(!iter->k.size); ++ } + + k = (struct bkey_s_c) { &iter->k, NULL }; -+ EBUG_ON(!k.k->size); + } + } -+ ++out: + iter->path->should_be_locked = true; + + bch2_btree_iter_verify_entry_exit(iter); @@ -19916,7 +21101,10 @@ index 000000000000..f43044e6fa37 + unsigned i; + + trans_for_each_path_inorder(trans, path, i) { -+ BUG_ON(prev && btree_path_cmp(prev, path) > 0); ++ if (prev && btree_path_cmp(prev, path) > 0) { ++ bch2_dump_trans_paths_updates(trans); ++ panic("trans paths out of order!\n"); ++ } + prev = path; + } +#endif @@ -19933,8 +21121,8 @@ index 000000000000..f43044e6fa37 + btree_path_verify_sorted_ref(trans, r); +} + -+static void btree_path_check_sort(struct btree_trans *trans, struct btree_path *path, -+ int cmp) ++inline void bch2_btree_path_check_sort(struct btree_trans *trans, struct btree_path *path, ++ int cmp) +{ + struct btree_path *n; + @@ -19990,6 +21178,11 @@ index 000000000000..f43044e6fa37 + + path->sorted_idx = pos ? 
pos->sorted_idx + 1 : 0; + ++ if (trans->in_traverse_all && ++ trans->traverse_all_idx != U8_MAX && ++ trans->traverse_all_idx >= path->sorted_idx) ++ trans->traverse_all_idx++; ++ + array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx); + + for (i = path->sorted_idx; i < trans->nr_sorted; i++) @@ -20003,7 +21196,15 @@ index 000000000000..f43044e6fa37 + if (iter->path) + bch2_path_put(trans, iter->path, + iter->flags & BTREE_ITER_INTENT); ++ if (iter->update_path) ++ bch2_path_put(trans, iter->update_path, ++ iter->flags & BTREE_ITER_INTENT); ++ if (iter->key_cache_path) ++ bch2_path_put(trans, iter->key_cache_path, ++ iter->flags & BTREE_ITER_INTENT); + iter->path = NULL; ++ iter->update_path = NULL; ++ iter->key_cache_path = NULL; +} + +static void __bch2_trans_iter_init(struct btree_trans *trans, @@ -20011,7 +21212,8 @@ index 000000000000..f43044e6fa37 + unsigned btree_id, struct bpos pos, + unsigned locks_want, + unsigned depth, -+ unsigned flags) ++ unsigned flags, ++ unsigned long ip) +{ + EBUG_ON(trans->restarted); + @@ -20027,8 +21229,19 @@ index 000000000000..f43044e6fa37 + btree_type_has_snapshots(btree_id)) + flags |= BTREE_ITER_FILTER_SNAPSHOTS; + ++ if (!test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags)) ++ flags |= BTREE_ITER_WITH_JOURNAL; ++ ++ if (!btree_id_cached(trans->c, btree_id)) { ++ flags &= ~BTREE_ITER_CACHED; ++ flags &= ~BTREE_ITER_WITH_KEY_CACHE; ++ } else if (!(flags & BTREE_ITER_CACHED)) ++ flags |= BTREE_ITER_WITH_KEY_CACHE; ++ + iter->trans = trans; + iter->path = NULL; ++ iter->update_path = NULL; ++ iter->key_cache_path = NULL; + iter->btree_id = btree_id; + iter->min_depth = depth; + iter->flags = flags; @@ -20037,14 +21250,12 @@ index 000000000000..f43044e6fa37 + iter->k.type = KEY_TYPE_deleted; + iter->k.p = pos; + iter->k.size = 0; ++#ifdef CONFIG_BCACHEFS_DEBUG ++ iter->ip_allocated = ip; ++#endif + -+ iter->path = bch2_path_get(trans, -+ flags & BTREE_ITER_CACHED, -+ btree_id, -+ iter->pos, -+ 
locks_want, -+ depth, -+ flags & BTREE_ITER_INTENT); ++ iter->path = bch2_path_get(trans, btree_id, iter->pos, ++ locks_want, depth, flags, ip); +} + +void bch2_trans_iter_init(struct btree_trans *trans, @@ -20053,7 +21264,7 @@ index 000000000000..f43044e6fa37 + unsigned flags) +{ + __bch2_trans_iter_init(trans, iter, btree_id, pos, -+ 0, 0, flags); ++ 0, 0, flags, _RET_IP_); +} + +void bch2_trans_node_iter_init(struct btree_trans *trans, @@ -20068,7 +21279,7 @@ index 000000000000..f43044e6fa37 + BTREE_ITER_NOT_EXTENTS| + __BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_ALL_SNAPSHOTS| -+ flags); ++ flags, _RET_IP_); + BUG_ON(iter->path->locks_want < min(locks_want, BTREE_MAX_DEPTH)); + BUG_ON(iter->path->level != depth); + BUG_ON(iter->min_depth != depth); @@ -20079,6 +21290,9 @@ index 000000000000..f43044e6fa37 + *dst = *src; + if (src->path) + __btree_path_get(src->path, src->flags & BTREE_ITER_INTENT); ++ if (src->update_path) ++ __btree_path_get(src->update_path, src->flags & BTREE_ITER_INTENT); ++ dst->key_cache_path = NULL; +} + +void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) @@ -20107,7 +21321,7 @@ index 000000000000..f43044e6fa37 + trans->mem_bytes = new_bytes; + + if (old_bytes) { -+ trace_trans_restart_mem_realloced(trans->ip, _RET_IP_, new_bytes); ++ trace_trans_restart_mem_realloced(trans->fn, _RET_IP_, new_bytes); + btree_trans_restart(trans); + return ERR_PTR(-EINTR); + } @@ -20141,8 +21355,7 @@ index 000000000000..f43044e6fa37 + trans->mem_top = 0; + + trans->hooks = NULL; -+ trans->extra_journal_entries = NULL; -+ trans->extra_journal_entry_u64s = 0; ++ trans->extra_journal_entries.nr = 0; + + if (trans->fs_usage_deltas) { + trans->fs_usage_deltas->used = 0; @@ -20155,13 +21368,21 @@ index 000000000000..f43044e6fa37 + path->should_be_locked = false; + + /* ++ * If the transaction wasn't restarted, we're presuming to be ++ * doing something new: dont keep iterators excpt the ones that ++ * are in use - except for the subvolumes btree: ++ */ 
++ if (!trans->restarted && path->btree_id != BTREE_ID_subvolumes) ++ path->preserve = false; ++ ++ /* + * XXX: we probably shouldn't be doing this if the transaction + * was restarted, but currently we still overflow transaction + * iterators if we do that + */ + if (!path->ref && !path->preserve) + __bch2_path_free(trans, path); -+ else if (!path->ref) ++ else + path->preserve = false; + } + @@ -20191,14 +21412,17 @@ index 000000000000..f43044e6fa37 + trans->updates = p; p += updates_bytes; +} + -+void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, -+ unsigned expected_nr_iters, -+ size_t expected_mem_bytes) ++void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, ++ unsigned expected_nr_iters, ++ size_t expected_mem_bytes, ++ const char *fn) + __acquires(&c->btree_trans_barrier) +{ ++ BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); ++ + memset(trans, 0, sizeof(*trans)); + trans->c = c; -+ trans->ip = _RET_IP_; ++ trans->fn = fn; + + bch2_trans_alloc_paths(trans, c); + @@ -20214,12 +21438,10 @@ index 000000000000..f43044e6fa37 + + trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); + -+#ifdef CONFIG_BCACHEFS_DEBUG + trans->pid = current->pid; + mutex_lock(&c->btree_trans_lock); + list_add(&trans->list, &c->btree_trans_list); + mutex_unlock(&c->btree_trans_lock); -+#endif +} + +static void check_btree_paths_leaked(struct btree_trans *trans) @@ -20233,7 +21455,7 @@ index 000000000000..f43044e6fa37 + goto leaked; + return; +leaked: -+ bch_err(c, "btree paths leaked from %pS!", (void *) trans->ip); ++ bch_err(c, "btree paths leaked from %s!", trans->fn); + trans_for_each_path(trans, path) + if (path->ref) + printk(KERN_ERR " btree %s %pS\n", @@ -20258,16 +21480,16 @@ index 000000000000..f43044e6fa37 + + check_btree_paths_leaked(trans); + -+#ifdef CONFIG_BCACHEFS_DEBUG + mutex_lock(&c->btree_trans_lock); + list_del(&trans->list); + mutex_unlock(&c->btree_trans_lock); -+#endif + + srcu_read_unlock(&c->btree_trans_barrier, 
trans->srcu_idx); + + bch2_journal_preres_put(&c->journal, &trans->journal_preres); + ++ kfree(trans->extra_journal_entries.data); ++ + if (trans->fs_usage_deltas) { + if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) == + REPLICAS_DELTA_LIST_MAX) @@ -20306,7 +21528,6 @@ index 000000000000..f43044e6fa37 + bch2_bpos_to_text(out, btree_node_pos(_b, cached)); +} + -+#ifdef CONFIG_BCACHEFS_DEBUG +static bool trans_has_locks(struct btree_trans *trans) +{ + struct btree_path *path; @@ -20316,14 +21537,13 @@ index 000000000000..f43044e6fa37 + return true; + return false; +} -+#endif + +void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) +{ -+#ifdef CONFIG_BCACHEFS_DEBUG + struct btree_trans *trans; + struct btree_path *path; + struct btree *b; ++ static char lock_types[] = { 'r', 'i', 'w' }; + unsigned l; + + mutex_lock(&c->btree_trans_lock); @@ -20331,7 +21551,7 @@ index 000000000000..f43044e6fa37 + if (!trans_has_locks(trans)) + continue; + -+ pr_buf(out, "%i %ps\n", trans->pid, (void *) trans->ip); ++ pr_buf(out, "%i %s\n", trans->pid, trans->fn); + + trans_for_each_path(trans, path) { + if (!path->nodes_locked) @@ -20360,10 +21580,11 @@ index 000000000000..f43044e6fa37 + b = READ_ONCE(trans->locking); + if (b) { + path = &trans->paths[trans->locking_path_idx]; -+ pr_buf(out, " locking path %u %c l=%u %s:", ++ pr_buf(out, " locking path %u %c l=%u %c %s:", + trans->locking_path_idx, + path->cached ? 
'c' : 'b', + trans->locking_level, ++ lock_types[trans->locking_lock_type], + bch2_btree_ids[trans->locking_btree_id]); + bch2_bpos_to_text(out, trans->locking_pos); + @@ -20374,36 +21595,40 @@ index 000000000000..f43044e6fa37 + } + } + mutex_unlock(&c->btree_trans_lock); -+#endif +} + +void bch2_fs_btree_iter_exit(struct bch_fs *c) +{ ++ if (c->btree_trans_barrier_initialized) ++ cleanup_srcu_struct(&c->btree_trans_barrier); + mempool_exit(&c->btree_trans_mem_pool); + mempool_exit(&c->btree_paths_pool); -+ cleanup_srcu_struct(&c->btree_trans_barrier); +} + +int bch2_fs_btree_iter_init(struct bch_fs *c) +{ + unsigned nr = BTREE_ITER_MAX; ++ int ret; + + INIT_LIST_HEAD(&c->btree_trans_list); + mutex_init(&c->btree_trans_lock); + -+ return init_srcu_struct(&c->btree_trans_barrier) ?: -+ mempool_init_kmalloc_pool(&c->btree_paths_pool, 1, ++ ret = mempool_init_kmalloc_pool(&c->btree_paths_pool, 1, + sizeof(struct btree_path) * nr + + sizeof(struct btree_insert_entry) * nr) ?: + mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1, -+ BTREE_TRANS_MEM_MAX); ++ BTREE_TRANS_MEM_MAX) ?: ++ init_srcu_struct(&c->btree_trans_barrier); ++ if (!ret) ++ c->btree_trans_barrier_initialized = true; ++ return ret; +} diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h new file mode 100644 -index 000000000000..31d2dda7ca05 +index 000000000000..f6700295e1a7 --- /dev/null +++ b/fs/bcachefs/btree_iter.h -@@ -0,0 +1,364 @@ +@@ -0,0 +1,406 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_ITER_H +#define _BCACHEFS_BTREE_ITER_H @@ -20456,11 +21681,6 @@ index 000000000000..31d2dda7ca05 + return btree_path_node(path, b->c.level + 1); +} + -+static inline int btree_iter_err(const struct btree_iter *iter) -+{ -+ return iter->flags & BTREE_ITER_ERROR ? 
-EIO : 0; -+} -+ +/* Iterate over paths within a transaction: */ + +static inline struct btree_path * @@ -20481,6 +21701,8 @@ index 000000000000..31d2dda7ca05 + return &trans->paths[idx]; +} + ++void bch2_btree_path_check_sort(struct btree_trans *, struct btree_path *, int); ++ +#define trans_for_each_path(_trans, _path) \ + for (_path = __trans_next_path((_trans), 0); \ + (_path); \ @@ -20536,11 +21758,15 @@ index 000000000000..31d2dda7ca05 + (_path)->idx + 1)) + +struct btree_path * __must_check -+bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *, bool); ++bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *, ++ bool, unsigned long); ++struct btree_path * __must_check ++bch2_btree_path_set_pos(struct btree_trans *, struct btree_path *, ++ struct bpos, bool, unsigned long); +int __must_check bch2_btree_path_traverse(struct btree_trans *, + struct btree_path *, unsigned); -+struct btree_path *bch2_path_get(struct btree_trans *, bool, enum btree_id, -+ struct bpos, unsigned, unsigned, bool); ++struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpos, ++ unsigned, unsigned, unsigned, unsigned long); +inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *); + +#ifdef CONFIG_BCACHEFS_DEBUG @@ -20614,9 +21840,14 @@ index 000000000000..31d2dda7ca05 +struct btree *bch2_btree_iter_peek_node(struct btree_iter *); +struct btree *bch2_btree_iter_next_node(struct btree_iter *); + -+struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *); ++struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *, struct bpos); +struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); + ++static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) ++{ ++ return bch2_btree_iter_peek_upto(iter, SPOS_MAX); ++} ++ +struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); + @@ -20627,11 +21858,8 @@ index 
000000000000..31d2dda7ca05 +bool bch2_btree_iter_advance(struct btree_iter *); +bool bch2_btree_iter_rewind(struct btree_iter *); + -+static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) ++static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) +{ -+ if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) -+ new_pos.snapshot = iter->snapshot; -+ + iter->k.type = KEY_TYPE_deleted; + iter->k.p.inode = iter->pos.inode = new_pos.inode; + iter->k.p.offset = iter->pos.offset = new_pos.offset; @@ -20639,6 +21867,19 @@ index 000000000000..31d2dda7ca05 + iter->k.size = 0; +} + ++static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) ++{ ++ if (unlikely(iter->update_path)) ++ bch2_path_put(iter->trans, iter->update_path, ++ iter->flags & BTREE_ITER_INTENT); ++ iter->update_path = NULL; ++ ++ if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) ++ new_pos.snapshot = iter->snapshot; ++ ++ __bch2_btree_iter_set_pos(iter, new_pos); ++} ++ +static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter) +{ + BUG_ON(!(iter->flags & BTREE_ITER_IS_EXTENTS)); @@ -20700,14 +21941,27 @@ index 000000000000..31d2dda7ca05 + return PTR_ERR_OR_ZERO(k.k); +} + -+static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, -+ unsigned flags) ++static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter, ++ unsigned flags) +{ + return flags & BTREE_ITER_SLOTS + ? 
bch2_btree_iter_peek_slot(iter) + : bch2_btree_iter_peek(iter); +} + ++static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter *iter, ++ struct bpos end, ++ unsigned flags) ++{ ++ if (!(flags & BTREE_ITER_SLOTS)) ++ return bch2_btree_iter_peek_upto(iter, end); ++ ++ if (bkey_cmp(iter->pos, end) > 0) ++ return bkey_s_c_null; ++ ++ return bch2_btree_iter_peek_slot(iter); ++} ++ +static inline int btree_trans_too_many_iters(struct btree_trans *trans) +{ + return hweight64(trans->paths_allocated) > BTREE_ITER_MAX / 2 @@ -20721,7 +21975,7 @@ index 000000000000..31d2dda7ca05 + struct bkey_s_c k; + + while (btree_trans_too_many_iters(trans) || -+ (k = __bch2_btree_iter_peek(iter, flags), ++ (k = bch2_btree_iter_peek_type(iter, flags), + bkey_err(k) == -EINTR)) + bch2_trans_begin(trans); + @@ -20740,7 +21994,15 @@ index 000000000000..31d2dda7ca05 + _start, _flags, _k, _ret) \ + for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ -+ (_k) = __bch2_btree_iter_peek(&(_iter), _flags), \ ++ (_k) = bch2_btree_iter_peek_type(&(_iter), _flags), \ ++ !((_ret) = bkey_err(_k)) && (_k).k; \ ++ bch2_btree_iter_advance(&(_iter))) ++ ++#define for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, \ ++ _start, _end, _flags, _k, _ret) \ ++ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ ++ (_start), (_flags)); \ ++ (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags),\ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_advance(&(_iter))) + @@ -20752,16 +22014,21 @@ index 000000000000..31d2dda7ca05 + +#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \ + for (; \ -+ (_k) = __bch2_btree_iter_peek(&(_iter), _flags), \ ++ (_k) = bch2_btree_iter_peek_type(&(_iter), _flags), \ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_advance(&(_iter))) + +/* new multiple iterator interface: */ + ++void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *); ++void 
bch2_dump_trans_updates(struct btree_trans *); +void bch2_dump_trans_paths_updates(struct btree_trans *); -+void bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned, size_t); ++void __bch2_trans_init(struct btree_trans *, struct bch_fs *, ++ unsigned, size_t, const char *); +void bch2_trans_exit(struct btree_trans *); + ++#define bch2_trans_init(...) __bch2_trans_init(__VA_ARGS__, __func__) ++ +void bch2_btree_trans_to_text(struct printbuf *, struct bch_fs *); + +void bch2_fs_btree_iter_exit(struct bch_fs *); @@ -20770,10 +22037,10 @@ index 000000000000..31d2dda7ca05 +#endif /* _BCACHEFS_BTREE_ITER_H */ diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c new file mode 100644 -index 000000000000..4f1bc1d165aa +index 000000000000..f5a942b6bbf7 --- /dev/null +++ b/fs/bcachefs/btree_key_cache.c -@@ -0,0 +1,736 @@ +@@ -0,0 +1,743 @@ + +#include "bcachefs.h" +#include "btree_cache.h" @@ -20922,28 +22189,32 @@ index 000000000000..4f1bc1d165aa +} + +static struct bkey_cached * -+btree_key_cache_create(struct btree_key_cache *c, ++btree_key_cache_create(struct bch_fs *c, + enum btree_id btree_id, + struct bpos pos) +{ ++ struct btree_key_cache *bc = &c->btree_key_cache; + struct bkey_cached *ck; + bool was_new = true; + -+ ck = bkey_cached_alloc(c); ++ ck = bkey_cached_alloc(bc); + + if (unlikely(!ck)) { -+ ck = bkey_cached_reuse(c); -+ if (unlikely(!ck)) ++ ck = bkey_cached_reuse(bc); ++ if (unlikely(!ck)) { ++ bch_err(c, "error allocating memory for key cache item, btree %s", ++ bch2_btree_ids[btree_id]); + return ERR_PTR(-ENOMEM); ++ } + + was_new = false; ++ } else { ++ if (btree_id == BTREE_ID_subvolumes) ++ six_lock_pcpu_alloc(&ck->c.lock); ++ else ++ six_lock_pcpu_free(&ck->c.lock); + } + -+ if (btree_id == BTREE_ID_subvolumes) -+ six_lock_pcpu_alloc(&ck->c.lock); -+ else -+ six_lock_pcpu_free(&ck->c.lock); -+ + ck->c.level = 0; + ck->c.btree_id = btree_id; + ck->key.btree_id = btree_id; @@ -20951,7 +22222,7 @@ index 
000000000000..4f1bc1d165aa + ck->valid = false; + ck->flags = 1U << BKEY_CACHED_ACCESSED; + -+ if (unlikely(rhashtable_lookup_insert_fast(&c->table, ++ if (unlikely(rhashtable_lookup_insert_fast(&bc->table, + &ck->hash, + bch2_btree_key_cache_params))) { + /* We raced with another fill: */ @@ -20961,15 +22232,15 @@ index 000000000000..4f1bc1d165aa + six_unlock_intent(&ck->c.lock); + kfree(ck); + } else { -+ mutex_lock(&c->lock); -+ bkey_cached_free(c, ck); -+ mutex_unlock(&c->lock); ++ mutex_lock(&bc->lock); ++ bkey_cached_free(bc, ck); ++ mutex_unlock(&bc->lock); + } + + return NULL; + } + -+ atomic_long_inc(&c->nr_keys); ++ atomic_long_inc(&bc->nr_keys); + + six_unlock_write(&ck->c.lock); + @@ -20980,21 +22251,24 @@ index 000000000000..4f1bc1d165aa + struct btree_path *ck_path, + struct bkey_cached *ck) +{ -+ struct btree_iter iter; ++ struct btree_path *path; + struct bkey_s_c k; + unsigned new_u64s = 0; + struct bkey_i *new_k = NULL; ++ struct bkey u; + int ret; + -+ bch2_trans_iter_init(trans, &iter, ck->key.btree_id, -+ ck->key.pos, BTREE_ITER_SLOTS); -+ k = bch2_btree_iter_peek_slot(&iter); -+ ret = bkey_err(k); ++ path = bch2_path_get(trans, ck->key.btree_id, ++ ck->key.pos, 0, 0, 0, _THIS_IP_); ++ ret = bch2_btree_path_traverse(trans, path, 0); + if (ret) + goto err; + ++ k = bch2_btree_path_peek_slot(path, &u); ++ + if (!bch2_btree_node_relock(trans, ck_path, 0)) { -+ trace_transaction_restart_ip(trans->ip, _THIS_IP_); ++ trace_trans_restart_relock_key_cache_fill(trans->fn, ++ _THIS_IP_, ck_path->btree_id, &ck_path->pos); + ret = btree_trans_restart(trans); + goto err; + } @@ -21009,6 +22283,8 @@ index 000000000000..4f1bc1d165aa + new_u64s = roundup_pow_of_two(new_u64s); + new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS); + if (!new_k) { ++ bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", ++ bch2_btree_ids[ck->key.btree_id], new_u64s); + ret = -ENOMEM; + goto err; + } @@ -21030,9 +22306,9 @@ index 
000000000000..4f1bc1d165aa + bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b); + + /* We're not likely to need this iterator again: */ -+ set_btree_iter_dontneed(&iter); ++ path->preserve = false; +err: -+ bch2_trans_iter_exit(trans, &iter); ++ bch2_path_put(trans, path, 0); + return ret; +} + @@ -21069,15 +22345,14 @@ index 000000000000..4f1bc1d165aa + return 0; + } + -+ ck = btree_key_cache_create(&c->btree_key_cache, -+ path->btree_id, path->pos); ++ ck = btree_key_cache_create(c, path->btree_id, path->pos); + ret = PTR_ERR_OR_ZERO(ck); + if (ret) + goto err; + if (!ck) + goto retry; + -+ mark_btree_node_locked(path, 0, SIX_LOCK_intent); ++ mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent); + path->locks_want = 1; + } else { + enum six_lock_type lock_want = __btree_lock_want(path, 0); @@ -21088,7 +22363,6 @@ index 000000000000..4f1bc1d165aa + if (!trans->restarted) + goto retry; + -+ trace_transaction_restart_ip(trans->ip, _THIS_IP_); + ret = -EINTR; + goto err; + } @@ -21099,7 +22373,7 @@ index 000000000000..4f1bc1d165aa + goto retry; + } + -+ mark_btree_node_locked(path, 0, lock_want); ++ mark_btree_node_locked(trans, path, 0, lock_want); + } + + path->l[0].lock_seq = ck->c.lock.state.seq; @@ -21108,7 +22382,7 @@ index 000000000000..4f1bc1d165aa + if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) { + if (!path->locks_want && + !__bch2_btree_path_upgrade(trans, path, 1)) { -+ trace_transaction_restart_ip(trans->ip, _THIS_IP_); ++ trace_transaction_restart_ip(trans->fn, _THIS_IP_); + ret = btree_trans_restart(trans); + goto err; + } @@ -21154,21 +22428,27 @@ index 000000000000..4f1bc1d165aa + BTREE_ITER_CACHED_NOFILL| + BTREE_ITER_CACHED_NOCREATE| + BTREE_ITER_INTENT); ++ b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE; ++ + ret = bch2_btree_iter_traverse(&c_iter); + if (ret) + goto out; + + ck = (void *) c_iter.path->l[0].b; -+ if (!ck || -+ (journal_seq && ck->journal.seq != journal_seq)) ++ if (!ck) + goto out; + + if 
(!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { -+ if (!evict) -+ goto out; -+ goto evict; ++ if (evict) ++ goto evict; ++ goto out; + } + ++ BUG_ON(!ck->valid); ++ ++ if (journal_seq && ck->journal.seq != journal_seq) ++ goto out; ++ + /* + * Since journal reclaim depends on us making progress here, and the + * allocator/copygc depend on journal reclaim making progress, we need @@ -21176,6 +22456,7 @@ index 000000000000..4f1bc1d165aa + * */ + ret = bch2_btree_iter_traverse(&b_iter) ?: + bch2_trans_update(trans, &b_iter, ck->k, ++ BTREE_UPDATE_KEY_CACHE_RECLAIM| + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + BTREE_TRIGGER_NORUN) ?: + bch2_trans_commit(trans, NULL, NULL, @@ -21183,7 +22464,7 @@ index 000000000000..4f1bc1d165aa + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE| + (ck->journal.seq == journal_last_seq(j) -+ ? BTREE_INSERT_JOURNAL_RESERVED ++ ? JOURNAL_WATERMARK_reserved + : 0)| + commit_flags); + if (ret) { @@ -21317,14 +22598,6 @@ index 000000000000..4f1bc1d165aa + return true; +} + -+#ifdef CONFIG_BCACHEFS_DEBUG -+void bch2_btree_key_cache_verify_clean(struct btree_trans *trans, -+ enum btree_id id, struct bpos pos) -+{ -+ BUG_ON(bch2_btree_key_cache_find(trans->c, id, pos)); -+} -+#endif -+ +static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ @@ -21438,11 +22711,12 @@ index 000000000000..4f1bc1d165aa + + rcu_read_lock(); + tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); -+ for (i = 0; i < tbl->size; i++) -+ rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { -+ bkey_cached_evict(bc, ck); -+ list_add(&ck->list, &bc->freed); -+ } ++ if (tbl) ++ for (i = 0; i < tbl->size; i++) ++ rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { ++ bkey_cached_evict(bc, ck); ++ list_add(&ck->list, &bc->freed); ++ } + rcu_read_unlock(); + + list_for_each_entry_safe(ck, n, &bc->freed, list) { @@ -21512,10 +22786,10 @@ index 000000000000..4f1bc1d165aa +} diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h 
new file mode 100644 -index 000000000000..0768ef3ca776 +index 000000000000..fd29c14c5626 --- /dev/null +++ b/fs/bcachefs/btree_key_cache.h -@@ -0,0 +1,54 @@ +@@ -0,0 +1,45 @@ +#ifndef _BCACHEFS_BTREE_KEY_CACHE_H +#define _BCACHEFS_BTREE_KEY_CACHE_H + @@ -21534,8 +22808,7 @@ index 000000000000..0768ef3ca776 + size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); + size_t max_dirty = 4096 + (nr_keys * 3) / 4; + -+ return nr_dirty > max_dirty && -+ test_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags); ++ return nr_dirty > max_dirty; +} + +int bch2_btree_key_cache_journal_flush(struct journal *, @@ -21551,14 +22824,6 @@ index 000000000000..0768ef3ca776 + struct btree_path *, struct bkey_i *); +int bch2_btree_key_cache_flush(struct btree_trans *, + enum btree_id, struct bpos); -+#ifdef CONFIG_BCACHEFS_DEBUG -+void bch2_btree_key_cache_verify_clean(struct btree_trans *, -+ enum btree_id, struct bpos); -+#else -+static inline void -+bch2_btree_key_cache_verify_clean(struct btree_trans *trans, -+ enum btree_id id, struct bpos pos) {} -+#endif + +void bch2_fs_btree_key_cache_exit(struct btree_key_cache *); +void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *); @@ -21572,10 +22837,10 @@ index 000000000000..0768ef3ca776 +#endif /* _BCACHEFS_BTREE_KEY_CACHE_H */ diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h new file mode 100644 -index 000000000000..d599008c5fc1 +index 000000000000..67c970d727ac --- /dev/null +++ b/fs/bcachefs/btree_locking.h -@@ -0,0 +1,243 @@ +@@ -0,0 +1,259 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_LOCKING_H +#define _BCACHEFS_BTREE_LOCKING_H @@ -21636,7 +22901,8 @@ index 000000000000..d599008c5fc1 + path->nodes_intent_locked &= ~(1 << level); +} + -+static inline void mark_btree_node_locked(struct btree_path *path, ++static inline void mark_btree_node_locked(struct btree_trans *trans, ++ struct btree_path *path, + unsigned level, + enum six_lock_type type) +{ @@ -21644,14 +22910,17 @@ 
index 000000000000..d599008c5fc1 + BUILD_BUG_ON(SIX_LOCK_read != 0); + BUILD_BUG_ON(SIX_LOCK_intent != 1); + ++ BUG_ON(trans->in_traverse_all && path->sorted_idx > trans->traverse_all_idx); ++ + path->nodes_locked |= 1 << level; + path->nodes_intent_locked |= type << level; +} + -+static inline void mark_btree_node_intent_locked(struct btree_path *path, ++static inline void mark_btree_node_intent_locked(struct btree_trans *trans, ++ struct btree_path *path, + unsigned level) +{ -+ mark_btree_node_locked(path, level, SIX_LOCK_intent); ++ mark_btree_node_locked(trans, path, level, SIX_LOCK_intent); +} + +static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level) @@ -21706,23 +22975,35 @@ index 000000000000..d599008c5fc1 + } +} + -+/* -+ * wrapper around six locks that just traces lock contended time -+ */ -+static inline void __btree_node_lock_type(struct bch_fs *c, struct btree *b, -+ enum six_lock_type type) ++static inline bool btree_node_lock_type(struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree *b, ++ struct bpos pos, unsigned level, ++ enum six_lock_type type, ++ six_lock_should_sleep_fn should_sleep_fn, void *p) +{ -+ u64 start_time = local_clock(); ++ struct bch_fs *c = trans->c; ++ u64 start_time; ++ bool ret; + -+ six_lock_type(&b->c.lock, type, NULL, NULL); -+ bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time); -+} ++ if (six_trylock_type(&b->c.lock, type)) ++ return true; + -+static inline void btree_node_lock_type(struct bch_fs *c, struct btree *b, -+ enum six_lock_type type) -+{ -+ if (!six_trylock_type(&b->c.lock, type)) -+ __btree_node_lock_type(c, b, type); ++ start_time = local_clock(); ++ ++ trans->locking_path_idx = path->idx; ++ trans->locking_pos = pos; ++ trans->locking_btree_id = path->btree_id; ++ trans->locking_level = level; ++ trans->locking_lock_type = type; ++ trans->locking = b; ++ ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p) == 0; ++ trans->locking = 
NULL; ++ ++ if (ret) ++ bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time); ++ ++ return ret; +} + +/* @@ -21821,10 +23102,10 @@ index 000000000000..d599008c5fc1 + diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h new file mode 100644 -index 000000000000..0d0a719f738f +index 000000000000..3438e089dba0 --- /dev/null +++ b/fs/bcachefs/btree_types.h -@@ -0,0 +1,700 @@ +@@ -0,0 +1,713 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_TYPES_H +#define _BCACHEFS_BTREE_TYPES_H @@ -21835,6 +23116,7 @@ index 000000000000..0d0a719f738f + +#include "bkey_methods.h" +#include "buckets_types.h" ++#include "darray.h" +#include "journal_types.h" + +struct open_bucket; @@ -21979,7 +23261,8 @@ index 000000000000..0d0a719f738f + struct mutex lock; + struct list_head live; + struct list_head freeable; -+ struct list_head freed; ++ struct list_head freed_pcpu; ++ struct list_head freed_nonpcpu; + + /* Number of elements in live + freeable lists */ + unsigned used; @@ -22029,14 +23312,16 @@ index 000000000000..0d0a719f738f + */ +#define BTREE_ITER_IS_EXTENTS (1 << 4) +#define BTREE_ITER_NOT_EXTENTS (1 << 5) -+#define BTREE_ITER_ERROR (1 << 6) -+#define BTREE_ITER_CACHED (1 << 7) -+#define BTREE_ITER_CACHED_NOFILL (1 << 8) -+#define BTREE_ITER_CACHED_NOCREATE (1 << 9) ++#define BTREE_ITER_CACHED (1 << 6) ++#define BTREE_ITER_CACHED_NOFILL (1 << 7) ++#define BTREE_ITER_CACHED_NOCREATE (1 << 8) ++#define BTREE_ITER_WITH_KEY_CACHE (1 << 9) +#define BTREE_ITER_WITH_UPDATES (1 << 10) -+#define __BTREE_ITER_ALL_SNAPSHOTS (1 << 11) -+#define BTREE_ITER_ALL_SNAPSHOTS (1 << 12) -+#define BTREE_ITER_FILTER_SNAPSHOTS (1 << 13) ++#define BTREE_ITER_WITH_JOURNAL (1 << 11) ++#define __BTREE_ITER_ALL_SNAPSHOTS (1 << 12) ++#define BTREE_ITER_ALL_SNAPSHOTS (1 << 13) ++#define BTREE_ITER_FILTER_SNAPSHOTS (1 << 14) ++#define BTREE_ITER_NOPRESERVE (1 << 15) + +enum btree_path_uptodate { + BTREE_ITER_UPTODATE = 0, @@ -22101,6 +23386,8 @@ index 
000000000000..0d0a719f738f +struct btree_iter { + struct btree_trans *trans; + struct btree_path *path; ++ struct btree_path *update_path; ++ struct btree_path *key_cache_path; + + enum btree_id btree_id:4; + unsigned min_depth:4; @@ -22118,6 +23405,9 @@ index 000000000000..0d0a719f738f + * bch2_btree_iter_next_slot() can correctly advance pos. + */ + struct bkey k; ++#ifdef CONFIG_BCACHEFS_DEBUG ++ unsigned long ip_allocated; ++#endif +}; + +struct btree_key_cache { @@ -22145,7 +23435,7 @@ index 000000000000..0d0a719f738f + struct btree_bkey_cached_common c; + + unsigned long flags; -+ u8 u64s; ++ u16 u64s; + bool valid; + u32 btree_trans_barrier_seq; + struct bkey_cached_key key; @@ -22163,12 +23453,20 @@ index 000000000000..0d0a719f738f + unsigned flags; + u8 bkey_type; + enum btree_id btree_id:8; -+ u8 level; ++ u8 level:4; + bool cached:1; + bool insert_trigger_run:1; + bool overwrite_trigger_run:1; ++ /* ++ * @old_k may be a key from the journal; @old_btree_u64s always refers ++ * to the size of the key being overwritten in the btree: ++ */ ++ u8 old_btree_u64s; + struct bkey_i *k; + struct btree_path *path; ++ /* key being overwritten: */ ++ struct bkey old_k; ++ const struct bch_val *old_v; + unsigned long ip_allocated; +}; + @@ -22190,23 +23488,26 @@ index 000000000000..0d0a719f738f + +struct btree_trans { + struct bch_fs *c; -+#ifdef CONFIG_BCACHEFS_DEBUG ++ const char *fn; + struct list_head list; + struct btree *locking; + unsigned locking_path_idx; + struct bpos locking_pos; + u8 locking_btree_id; + u8 locking_level; ++ u8 locking_lock_type; + pid_t pid; -+#endif -+ unsigned long ip; + int srcu_idx; + + u8 nr_sorted; + u8 nr_updates; ++ u8 traverse_all_idx; + bool used_mempool:1; + bool in_traverse_all:1; + bool restarted:1; ++ bool memory_allocation_failure:1; ++ bool journal_transaction_names:1; ++ bool is_initial_gc:1; + /* + * For when bch2_trans_update notices we'll be splitting a compressed + * extent: @@ -22225,8 +23526,7 @@ index 
000000000000..0d0a719f738f + + /* update path: */ + struct btree_trans_commit_hook *hooks; -+ struct jset_entry *extra_journal_entries; -+ unsigned extra_journal_entry_u64s; ++ DARRAY(u64) extra_journal_entries; + struct journal_entry_pin *journal_pin; + + struct journal_res journal_res; @@ -22239,7 +23539,31 @@ index 000000000000..0d0a719f738f + struct replicas_delta_list *fs_usage_deltas; +}; + -+#define BTREE_FLAG(flag) \ ++#define BTREE_FLAGS() \ ++ x(read_in_flight) \ ++ x(read_error) \ ++ x(dirty) \ ++ x(need_write) \ ++ x(write_blocked) \ ++ x(will_make_reachable) \ ++ x(noevict) \ ++ x(write_idx) \ ++ x(accessed) \ ++ x(write_in_flight) \ ++ x(write_in_flight_inner) \ ++ x(just_written) \ ++ x(dying) \ ++ x(fake) \ ++ x(need_rewrite) \ ++ x(never_write) ++ ++enum btree_flags { ++#define x(flag) BTREE_NODE_##flag, ++ BTREE_FLAGS() ++#undef x ++}; ++ ++#define x(flag) \ +static inline bool btree_node_ ## flag(struct btree *b) \ +{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ + \ @@ -22249,36 +23573,8 @@ index 000000000000..0d0a719f738f +static inline void clear_btree_node_ ## flag(struct btree *b) \ +{ clear_bit(BTREE_NODE_ ## flag, &b->flags); } + -+enum btree_flags { -+ BTREE_NODE_read_in_flight, -+ BTREE_NODE_read_error, -+ BTREE_NODE_dirty, -+ BTREE_NODE_need_write, -+ BTREE_NODE_noevict, -+ BTREE_NODE_write_idx, -+ BTREE_NODE_accessed, -+ BTREE_NODE_write_in_flight, -+ BTREE_NODE_write_in_flight_inner, -+ BTREE_NODE_just_written, -+ BTREE_NODE_dying, -+ BTREE_NODE_fake, -+ BTREE_NODE_need_rewrite, -+ BTREE_NODE_never_write, -+}; -+ -+BTREE_FLAG(read_in_flight); -+BTREE_FLAG(read_error); -+BTREE_FLAG(need_write); -+BTREE_FLAG(noevict); -+BTREE_FLAG(write_idx); -+BTREE_FLAG(accessed); -+BTREE_FLAG(write_in_flight); -+BTREE_FLAG(write_in_flight_inner); -+BTREE_FLAG(just_written); -+BTREE_FLAG(dying); -+BTREE_FLAG(fake); -+BTREE_FLAG(need_rewrite); -+BTREE_FLAG(never_write); ++BTREE_FLAGS() ++#undef x + +static inline struct btree_write 
*btree_current_write(struct btree *b) +{ @@ -22408,24 +23704,9 @@ index 000000000000..0d0a719f738f + return __btree_node_type(b->c.level, b->c.btree_id); +} + -+static inline bool btree_node_type_is_extents(enum btree_node_type type) -+{ -+ switch (type) { -+ case BKEY_TYPE_extents: -+ case BKEY_TYPE_reflink: -+ return true; -+ default: -+ return false; -+ } -+} -+ -+static inline bool btree_node_is_extents(struct btree *b) -+{ -+ return btree_node_type_is_extents(btree_node_type(b)); -+} -+ +#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ + ((1U << BKEY_TYPE_extents)| \ ++ (1U << BKEY_TYPE_alloc)| \ + (1U << BKEY_TYPE_inodes)| \ + (1U << BKEY_TYPE_stripes)| \ + (1U << BKEY_TYPE_reflink)| \ @@ -22441,6 +23722,16 @@ index 000000000000..0d0a719f738f + (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \ + BTREE_NODE_TYPE_HAS_MEM_TRIGGERS) + ++#define BTREE_ID_IS_EXTENTS \ ++ ((1U << BTREE_ID_extents)| \ ++ (1U << BTREE_ID_reflink)| \ ++ (1U << BTREE_ID_freespace)) ++ ++static inline bool btree_node_type_is_extents(enum btree_node_type type) ++{ ++ return (1U << type) & BTREE_ID_IS_EXTENTS; ++} ++ +#define BTREE_ID_HAS_SNAPSHOTS \ + ((1U << BTREE_ID_extents)| \ + (1U << BTREE_ID_inodes)| \ @@ -22458,6 +23749,7 @@ index 000000000000..0d0a719f738f + +enum btree_update_flags { + __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE, ++ __BTREE_UPDATE_KEY_CACHE_RECLAIM, + + __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ + @@ -22470,6 +23762,7 @@ index 000000000000..0d0a719f738f +}; + +#define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ++#define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM) + +#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) + @@ -22484,6 +23777,7 @@ index 000000000000..0d0a719f738f + ((1U << KEY_TYPE_alloc)| \ + (1U << KEY_TYPE_alloc_v2)| \ + (1U << KEY_TYPE_alloc_v3)| \ ++ (1U << KEY_TYPE_alloc_v4)| \ + (1U << KEY_TYPE_stripe)| \ + (1U << KEY_TYPE_inode)| \ + (1U << KEY_TYPE_inode_v2)| \ @@ -22527,10 
+23821,10 @@ index 000000000000..0d0a719f738f +#endif /* _BCACHEFS_BTREE_TYPES_H */ diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h new file mode 100644 -index 000000000000..0268dd74f0ab +index 000000000000..ad13b0739a68 --- /dev/null +++ b/fs/bcachefs/btree_update.h -@@ -0,0 +1,155 @@ +@@ -0,0 +1,141 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_UPDATE_H +#define _BCACHEFS_BTREE_UPDATE_H @@ -22549,12 +23843,12 @@ index 000000000000..0268dd74f0ab +void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64); + +enum btree_insert_flags { -+ __BTREE_INSERT_NOFAIL, ++ /* First two bits for journal watermark: */ ++ __BTREE_INSERT_NOFAIL = 2, + __BTREE_INSERT_NOCHECK_RW, + __BTREE_INSERT_LAZY_RW, + __BTREE_INSERT_USE_RESERVE, + __BTREE_INSERT_JOURNAL_REPLAY, -+ __BTREE_INSERT_JOURNAL_RESERVED, + __BTREE_INSERT_JOURNAL_RECLAIM, + __BTREE_INSERT_NOWAIT, + __BTREE_INSERT_GC_LOCK_HELD, @@ -22574,9 +23868,6 @@ index 000000000000..0268dd74f0ab +/* Insert is for journal replay - don't get journal reservations: */ +#define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY) + -+/* Indicates that we have pre-reserved space in the journal: */ -+#define BTREE_INSERT_JOURNAL_RESERVED (1 << __BTREE_INSERT_JOURNAL_RESERVED) -+ +/* Insert is being called from journal reclaim path: */ +#define BTREE_INSERT_JOURNAL_RECLAIM (1 << __BTREE_INSERT_JOURNAL_RECLAIM) + @@ -22596,7 +23887,7 @@ index 000000000000..0268dd74f0ab +int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id, + struct bpos, struct bpos, unsigned, u64 *); +int bch2_btree_delete_range(struct bch_fs *, enum btree_id, -+ struct bpos, struct bpos, u64 *); ++ struct bpos, struct bpos, unsigned, u64 *); + +int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *, + struct btree *, unsigned); @@ -22606,12 +23897,18 @@ index 000000000000..0268dd74f0ab +int bch2_btree_node_update_key_get_iter(struct btree_trans *, + struct btree *, struct 
bkey_i *, bool); + -+int bch2_trans_update(struct btree_trans *, struct btree_iter *, -+ struct bkey_i *, enum btree_update_flags); ++int bch2_trans_update_extent(struct btree_trans *, struct btree_iter *, ++ struct bkey_i *, enum btree_update_flags); ++ ++int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *, ++ struct bkey_i *, enum btree_update_flags); ++ +void bch2_trans_commit_hook(struct btree_trans *, + struct btree_trans_commit_hook *); +int __bch2_trans_commit(struct btree_trans *); + ++int bch2_trans_log_msg(struct btree_trans *, const char *); ++ +/** + * bch2_trans_commit - insert keys at given iterator positions + * @@ -22668,30 +23965,13 @@ index 000000000000..0268dd74f0ab + (_i) < (_trans)->updates + (_trans)->nr_updates; \ + (_i)++) + -+static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans, -+ enum btree_id btree_id, -+ struct bpos pos) -+{ -+ struct btree_insert_entry *i; -+ -+ trans_for_each_update(trans, i) -+ if ((cmp_int(btree_id, i->btree_id) ?: -+ bpos_cmp(pos, i->k->k.p)) <= 0) { -+ if (btree_id == i->btree_id) -+ return i->k; -+ break; -+ } -+ -+ return NULL; -+} -+ +#endif /* _BCACHEFS_BTREE_UPDATE_H */ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c new file mode 100644 -index 000000000000..61c7757bd3ca +index 000000000000..42ae3b0c5839 --- /dev/null +++ b/fs/bcachefs/btree_update_interior.c -@@ -0,0 +1,2187 @@ +@@ -0,0 +1,2238 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -22710,6 +23990,7 @@ index 000000000000..61c7757bd3ca +#include "journal.h" +#include "journal_reclaim.h" +#include "keylist.h" ++#include "recovery.h" +#include "replicas.h" +#include "super-io.h" + @@ -22734,11 +24015,11 @@ index 000000000000..61c7757bd3ca + struct bkey_s_c k; + struct bkey_s_c_btree_ptr_v2 bp; + struct bkey unpacked; -+ char buf1[100], buf2[100]; ++ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + + BUG_ON(!b->c.level); + -+ if 
(!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)) ++ if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) + return; + + bch2_btree_node_iter_init_from_start(&iter, b); @@ -22751,9 +24032,9 @@ index 000000000000..61c7757bd3ca + + if (bpos_cmp(next_node, bp.v->min_key)) { + bch2_dump_btree_node(c, b); -+ panic("expected next min_key %s got %s\n", -+ (bch2_bpos_to_text(&PBUF(buf1), next_node), buf1), -+ (bch2_bpos_to_text(&PBUF(buf2), bp.v->min_key), buf2)); ++ bch2_bpos_to_text(&buf1, next_node); ++ bch2_bpos_to_text(&buf2, bp.v->min_key); ++ panic("expected next min_key %s got %s\n", buf1.buf, buf2.buf); + } + + bch2_btree_node_iter_advance(&iter, b); @@ -22761,9 +24042,9 @@ index 000000000000..61c7757bd3ca + if (bch2_btree_node_iter_end(&iter)) { + if (bpos_cmp(k.k->p, b->key.k.p)) { + bch2_dump_btree_node(c, b); -+ panic("expected end %s got %s\n", -+ (bch2_bpos_to_text(&PBUF(buf1), b->key.k.p), buf1), -+ (bch2_bpos_to_text(&PBUF(buf2), k.k->p), buf2)); ++ bch2_bpos_to_text(&buf1, b->key.k.p); ++ bch2_bpos_to_text(&buf2, k.k->p); ++ panic("expected end %s got %s\n", buf1.buf, buf2.buf); + } + break; + } @@ -22874,6 +24155,7 @@ index 000000000000..61c7757bd3ca +static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, + struct disk_reservation *res, + struct closure *cl, ++ bool interior_node, + unsigned flags) +{ + struct write_point *wp; @@ -22886,10 +24168,10 @@ index 000000000000..61c7757bd3ca + + if (flags & BTREE_INSERT_USE_RESERVE) { + nr_reserve = 0; -+ alloc_reserve = RESERVE_BTREE_MOVINGGC; ++ alloc_reserve = RESERVE_btree_movinggc; + } else { + nr_reserve = BTREE_NODE_RESERVE; -+ alloc_reserve = RESERVE_BTREE; ++ alloc_reserve = RESERVE_btree; + } + + mutex_lock(&c->btree_reserve_cache_lock); @@ -22917,12 +24199,12 @@ index 000000000000..61c7757bd3ca + if (IS_ERR(wp)) + return ERR_CAST(wp); + -+ if (wp->sectors_free < c->opts.btree_node_size) { ++ if (wp->sectors_free < btree_sectors(c)) { + struct open_bucket *ob; + unsigned i; + + 
open_bucket_for_each(c, &wp->ptrs, ob, i) -+ if (ob->sectors_free < c->opts.btree_node_size) ++ if (ob->sectors_free < btree_sectors(c)) + ob->sectors_free = 0; + + bch2_alloc_sectors_done(c, wp); @@ -22930,12 +24212,14 @@ index 000000000000..61c7757bd3ca + } + + bkey_btree_ptr_v2_init(&tmp.k); -+ bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, c->opts.btree_node_size); ++ bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c), false); + + bch2_open_bucket_get(c, wp, &ob); + bch2_alloc_sectors_done(c, wp); +mem_alloc: -+ b = bch2_btree_node_mem_alloc(c); ++ b = bch2_btree_node_mem_alloc(c, interior_node); ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); + + /* we hold cannibalize_lock: */ + BUG_ON(IS_ERR(b)); @@ -22951,15 +24235,19 @@ index 000000000000..61c7757bd3ca +{ + struct bch_fs *c = as->c; + struct btree *b; ++ struct prealloc_nodes *p = &as->prealloc_nodes[!!level]; + int ret; + + BUG_ON(level >= BTREE_MAX_DEPTH); -+ BUG_ON(!as->nr_prealloc_nodes); ++ BUG_ON(!p->nr); + -+ b = as->prealloc_nodes[--as->nr_prealloc_nodes]; ++ b = p->b[--p->nr]; ++ ++ six_lock_intent(&b->c.lock, NULL, NULL); ++ six_lock_write(&b->c.lock, NULL, NULL); + + set_btree_node_accessed(b); -+ set_btree_node_dirty(c, b); ++ set_btree_node_dirty_acct(c, b); + set_btree_node_need_write(b); + + bch2_bset_init_first(b, &b->data->keys); @@ -23065,70 +24353,94 @@ index 000000000000..61c7757bd3ca +static void bch2_btree_reserve_put(struct btree_update *as) +{ + struct bch_fs *c = as->c; ++ struct prealloc_nodes *p; + + mutex_lock(&c->btree_reserve_cache_lock); + -+ while (as->nr_prealloc_nodes) { -+ struct btree *b = as->prealloc_nodes[--as->nr_prealloc_nodes]; ++ for (p = as->prealloc_nodes; ++ p < as->prealloc_nodes + ARRAY_SIZE(as->prealloc_nodes); ++ p++) { ++ while (p->nr) { ++ struct btree *b = p->b[--p->nr]; + -+ six_unlock_write(&b->c.lock); ++ six_lock_intent(&b->c.lock, NULL, NULL); ++ six_lock_write(&b->c.lock, NULL, NULL); + -+ if (c->btree_reserve_cache_nr 
< -+ ARRAY_SIZE(c->btree_reserve_cache)) { -+ struct btree_alloc *a = -+ &c->btree_reserve_cache[c->btree_reserve_cache_nr++]; ++ if (c->btree_reserve_cache_nr < ++ ARRAY_SIZE(c->btree_reserve_cache)) { ++ struct btree_alloc *a = ++ &c->btree_reserve_cache[c->btree_reserve_cache_nr++]; + -+ a->ob = b->ob; -+ b->ob.nr = 0; -+ bkey_copy(&a->k, &b->key); -+ } else { -+ bch2_open_buckets_put(c, &b->ob); ++ a->ob = b->ob; ++ b->ob.nr = 0; ++ bkey_copy(&a->k, &b->key); ++ } else { ++ bch2_open_buckets_put(c, &b->ob); ++ } ++ ++ __btree_node_free(c, b); ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); + } -+ -+ btree_node_lock_type(c, b, SIX_LOCK_write); -+ __btree_node_free(c, b); -+ six_unlock_write(&b->c.lock); -+ -+ six_unlock_intent(&b->c.lock); + } + + mutex_unlock(&c->btree_reserve_cache_lock); +} + -+static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes, -+ unsigned flags, struct closure *cl) ++static int bch2_btree_reserve_get(struct btree_update *as, ++ unsigned nr_nodes[2], ++ unsigned flags) +{ + struct bch_fs *c = as->c; ++ struct closure cl; + struct btree *b; ++ unsigned interior; + int ret; + -+ BUG_ON(nr_nodes > BTREE_RESERVE_MAX); ++ closure_init_stack(&cl); ++retry: ++ ++ BUG_ON(nr_nodes[0] + nr_nodes[1] > BTREE_RESERVE_MAX); + + /* + * Protects reaping from the btree node cache and using the btree node + * open bucket reserve: ++ * ++ * BTREE_INSERT_NOWAIT only applies to btree node allocation, not ++ * blocking on this lock: + */ -+ ret = bch2_btree_cache_cannibalize_lock(c, cl); ++ ret = bch2_btree_cache_cannibalize_lock(c, &cl); + if (ret) -+ return ret; ++ goto err; + -+ while (as->nr_prealloc_nodes < nr_nodes) { -+ b = __bch2_btree_node_alloc(c, &as->disk_res, -+ flags & BTREE_INSERT_NOWAIT -+ ? 
NULL : cl, flags); -+ if (IS_ERR(b)) { -+ ret = PTR_ERR(b); -+ goto err_free; ++ for (interior = 0; interior < 2; interior++) { ++ struct prealloc_nodes *p = as->prealloc_nodes + interior; ++ ++ while (p->nr < nr_nodes[interior]) { ++ b = __bch2_btree_node_alloc(c, &as->disk_res, ++ flags & BTREE_INSERT_NOWAIT ++ ? NULL : &cl, ++ interior, flags); ++ if (IS_ERR(b)) { ++ ret = PTR_ERR(b); ++ goto err; ++ } ++ ++ p->b[p->nr++] = b; + } -+ -+ as->prealloc_nodes[as->nr_prealloc_nodes++] = b; + } + + bch2_btree_cache_cannibalize_unlock(c); ++ closure_sync(&cl); + return 0; -+err_free: ++err: + bch2_btree_cache_cannibalize_unlock(c); -+ trace_btree_reserve_get_fail(c, nr_nodes, cl); ++ closure_sync(&cl); ++ ++ if (ret == -EAGAIN) ++ goto retry; ++ ++ trace_btree_reserve_get_fail(c, nr_nodes[0] + nr_nodes[1], &cl); + return ret; +} + @@ -23149,15 +24461,23 @@ index 000000000000..61c7757bd3ca + bch2_disk_reservation_put(c, &as->disk_res); + bch2_btree_reserve_put(as); + ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total], ++ as->start_time); ++ + mutex_lock(&c->btree_interior_update_lock); + list_del(&as->unwritten_list); + list_del(&as->list); -+ mutex_unlock(&c->btree_interior_update_lock); + + closure_debug_destroy(&as->cl); + mempool_free(as, &c->btree_interior_update_pool); + ++ /* ++ * Have to do the wakeup with btree_interior_update_lock still held, ++ * since being on btree_interior_update_list is our ref on @c: ++ */ + closure_wake_up(&c->btree_interior_update_wait); ++ ++ mutex_unlock(&c->btree_interior_update_lock); +} + +static void btree_update_will_delete_key(struct btree_update *as, @@ -23186,24 +24506,25 @@ index 000000000000..61c7757bd3ca + struct bkey_i *k; + int ret; + -+ trans->extra_journal_entries = (void *) &as->journal_entries[0]; -+ trans->extra_journal_entry_u64s = as->journal_u64s; ++ ret = darray_make_room(trans->extra_journal_entries, as->journal_u64s); ++ if (ret) ++ return ret; ++ ++ 
memcpy(&darray_top(trans->extra_journal_entries), ++ as->journal_entries, ++ as->journal_u64s * sizeof(u64)); ++ trans->extra_journal_entries.nr += as->journal_u64s; ++ + trans->journal_pin = &as->journal; + + for_each_keylist_key(&as->new_keys, k) { -+ ret = bch2_trans_mark_key(trans, -+ bkey_s_c_null, -+ bkey_i_to_s_c(k), -+ BTREE_TRIGGER_INSERT); ++ ret = bch2_trans_mark_new(trans, k, 0); + if (ret) + return ret; + } + + for_each_keylist_key(&as->old_keys, k) { -+ ret = bch2_trans_mark_key(trans, -+ bkey_i_to_s_c(k), -+ bkey_s_c_null, -+ BTREE_TRIGGER_OVERWRITE); ++ ret = bch2_trans_mark_old(trans, bkey_i_to_s_c(k), 0); + if (ret) + return ret; + } @@ -23231,8 +24552,6 @@ index 000000000000..61c7757bd3ca + if (ret) + goto err; + -+ BUG_ON(!journal_pin_active(&as->journal)); -+ + /* + * Wait for any in flight writes to finish before we free the old nodes + * on disk: @@ -23268,7 +24587,7 @@ index 000000000000..61c7757bd3ca + BTREE_INSERT_NOFAIL| + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_JOURNAL_RECLAIM| -+ BTREE_INSERT_JOURNAL_RESERVED, ++ JOURNAL_WATERMARK_reserved, + btree_update_nodes_written_trans(&trans, as)); + bch2_trans_exit(&trans); + @@ -23288,11 +24607,13 @@ index 000000000000..61c7757bd3ca + * we're in journal error state: + */ + -+ btree_node_lock_type(c, b, SIX_LOCK_intent); -+ btree_node_lock_type(c, b, SIX_LOCK_write); ++ six_lock_intent(&b->c.lock, NULL, NULL); ++ six_lock_write(&b->c.lock, NULL, NULL); + mutex_lock(&c->btree_interior_update_lock); + + list_del(&as->write_blocked_list); ++ if (list_empty(&b->write_blocked)) ++ clear_btree_node_write_blocked(b); + + /* + * Node might have been freed, recheck under @@ -23337,13 +24658,14 @@ index 000000000000..61c7757bd3ca + + BUG_ON(b->will_make_reachable != (unsigned long) as); + b->will_make_reachable = 0; ++ clear_btree_node_will_make_reachable(b); + } + mutex_unlock(&c->btree_interior_update_lock); + + for (i = 0; i < as->nr_new_nodes; i++) { + b = as->new_nodes[i]; + -+ 
btree_node_lock_type(c, b, SIX_LOCK_read); ++ six_lock_read(&b->c.lock, NULL, NULL); + btree_node_write_if_need(c, b, SIX_LOCK_read); + six_unlock_read(&b->c.lock); + } @@ -23403,6 +24725,8 @@ index 000000000000..61c7757bd3ca + + as->mode = BTREE_INTERIOR_UPDATING_NODE; + as->b = b; ++ ++ set_btree_node_write_blocked(b); + list_add(&as->write_blocked_list, &b->write_blocked); + + mutex_unlock(&c->btree_interior_update_lock); @@ -23468,6 +24792,7 @@ index 000000000000..61c7757bd3ca + + as->new_nodes[as->nr_new_nodes++] = b; + b->will_make_reachable = 1UL|(unsigned long) as; ++ set_btree_node_will_make_reachable(b); + + mutex_unlock(&c->btree_interior_update_lock); + @@ -23490,6 +24815,7 @@ index 000000000000..61c7757bd3ca + * xchg() is for synchronization with bch2_btree_complete_write: + */ + v = xchg(&b->will_make_reachable, 0); ++ clear_btree_node_will_make_reachable(b); + as = (struct btree_update *) (v & ~1UL); + + if (!as) { @@ -23555,7 +24881,7 @@ index 000000000000..61c7757bd3ca + closure_wake_up(&c->btree_interior_update_wait); + } + -+ clear_btree_node_dirty(c, b); ++ clear_btree_node_dirty_acct(c, b); + clear_btree_node_need_write(b); + + /* @@ -23596,6 +24922,9 @@ index 000000000000..61c7757bd3ca + +static void bch2_btree_update_done(struct btree_update *as) +{ ++ struct bch_fs *c = as->c; ++ u64 start_time = as->start_time; ++ + BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE); + + if (as->took_gc_lock) @@ -23606,34 +24935,50 @@ index 000000000000..61c7757bd3ca + + continue_at(&as->cl, btree_update_set_nodes_written, + as->c->btree_interior_update_worker); ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground], ++ start_time); +} + +static struct btree_update * +bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, -+ unsigned level, unsigned nr_nodes, unsigned flags) ++ unsigned level, bool split, unsigned flags) +{ + struct bch_fs *c = trans->c; + struct btree_update *as; -+ struct closure cl; ++ u64 
start_time = local_clock(); + int disk_res_flags = (flags & BTREE_INSERT_NOFAIL) + ? BCH_DISK_RESERVATION_NOFAIL : 0; -+ int journal_flags = 0; ++ unsigned nr_nodes[2] = { 0, 0 }; ++ unsigned update_level = level; ++ int journal_flags = flags & JOURNAL_WATERMARK_MASK; + int ret = 0; + + BUG_ON(!path->should_be_locked); + -+ if (flags & BTREE_INSERT_JOURNAL_RESERVED) -+ journal_flags |= JOURNAL_RES_GET_RESERVED; ++ if (flags & BTREE_INSERT_JOURNAL_RECLAIM) ++ journal_flags |= JOURNAL_RES_GET_NONBLOCK; + -+ closure_init_stack(&cl); -+retry: ++ while (1) { ++ nr_nodes[!!update_level] += 1 + split; ++ update_level++; ++ ++ if (!btree_path_node(path, update_level)) ++ break; ++ ++ /* ++ * XXX: figure out how far we might need to split, ++ * instead of locking/reserving all the way to the root: ++ */ ++ split = update_level + 1 < BTREE_MAX_DEPTH; ++ } ++ ++ /* Might have to allocate a new root: */ ++ if (update_level < BTREE_MAX_DEPTH) ++ nr_nodes[1] += 1; + -+ /* -+ * XXX: figure out how far we might need to split, -+ * instead of locking/reserving all the way to the root: -+ */ + if (!bch2_btree_path_upgrade(trans, path, U8_MAX)) { -+ trace_trans_restart_iter_upgrade(trans->ip, _RET_IP_, ++ trace_trans_restart_iter_upgrade(trans->fn, _RET_IP_, + path->btree_id, &path->pos); + ret = btree_trans_restart(trans); + return ERR_PTR(ret); @@ -23654,6 +24999,7 @@ index 000000000000..61c7757bd3ca + memset(as, 0, sizeof(*as)); + closure_init(&as->cl, NULL); + as->c = c; ++ as->start_time = start_time; + as->mode = BTREE_INTERIOR_NO_UPDATE; + as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD); + as->btree_id = path->btree_id; @@ -23680,60 +25026,37 @@ index 000000000000..61c7757bd3ca + if (ret) + goto err; + ++ bch2_trans_unlock(trans); ++ + ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, + BTREE_UPDATE_JOURNAL_RES, -+ journal_flags|JOURNAL_RES_GET_NONBLOCK); -+ if (ret == -EAGAIN) { -+ bch2_trans_unlock(trans); -+ -+ if (flags & BTREE_INSERT_JOURNAL_RECLAIM) 
{ -+ bch2_btree_update_free(as); -+ btree_trans_restart(trans); -+ return ERR_PTR(ret); -+ } -+ -+ ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, -+ BTREE_UPDATE_JOURNAL_RES, -+ journal_flags); -+ if (ret) { -+ trace_trans_restart_journal_preres_get(trans->ip, _RET_IP_); -+ goto err; -+ } -+ -+ if (!bch2_trans_relock(trans)) { -+ ret = -EINTR; -+ goto err; -+ } ++ journal_flags); ++ if (ret) { ++ bch2_btree_update_free(as); ++ trace_trans_restart_journal_preres_get(trans->fn, _RET_IP_); ++ btree_trans_restart(trans); ++ return ERR_PTR(ret); + } + + ret = bch2_disk_reservation_get(c, &as->disk_res, -+ nr_nodes * c->opts.btree_node_size, ++ (nr_nodes[0] + nr_nodes[1]) * btree_sectors(c), + c->opts.metadata_replicas, + disk_res_flags); + if (ret) + goto err; + -+ ret = bch2_btree_reserve_get(as, nr_nodes, flags, &cl); ++ ret = bch2_btree_reserve_get(as, nr_nodes, flags); + if (ret) + goto err; + -+ bch2_journal_pin_add(&c->journal, -+ atomic64_read(&c->journal.seq), -+ &as->journal, NULL); ++ if (!bch2_trans_relock(trans)) { ++ ret = -EINTR; ++ goto err; ++ } + + return as; +err: + bch2_btree_update_free(as); -+ -+ if (ret == -EAGAIN) { -+ bch2_trans_unlock(trans); -+ closure_sync(&cl); -+ ret = -EINTR; -+ } -+ -+ if (ret == -EINTR && bch2_trans_relock(trans)) -+ goto retry; -+ + return ERR_PTR(ret); +} + @@ -23783,8 +25106,7 @@ index 000000000000..61c7757bd3ca + struct btree *old; + + trace_btree_set_root(c, b); -+ BUG_ON(!b->written && -+ !test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)); ++ BUG_ON(!b->written); + + old = btree_node_root(c, b); + @@ -23824,13 +25146,17 @@ index 000000000000..61c7757bd3ca + BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 && + !btree_ptr_sectors_written(insert)); + ++ if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) ++ bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p); ++ + invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)) ?: + bch2_bkey_in_btree_node(b, 
bkey_i_to_s_c(insert)); + if (invalid) { -+ char buf[160]; ++ struct printbuf buf = PRINTBUF; + -+ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(insert)); -+ bch2_fs_inconsistent(c, "inserting invalid bkey %s: %s", buf, invalid); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); ++ bch2_fs_inconsistent(c, "inserting invalid bkey %s: %s", buf.buf, invalid); ++ printbuf_exit(&buf); + dump_stack(); + } + @@ -23848,7 +25174,7 @@ index 000000000000..61c7757bd3ca + bch2_btree_node_iter_advance(node_iter, b); + + bch2_btree_bset_insert_key(trans, path, b, node_iter, insert); -+ set_btree_node_dirty(c, b); ++ set_btree_node_dirty_acct(c, b); + set_btree_node_need_write(b); +} + @@ -24069,8 +25395,8 @@ index 000000000000..61c7757bd3ca + six_unlock_write(&n2->c.lock); + six_unlock_write(&n1->c.lock); + -+ bch2_btree_node_write(c, n1, SIX_LOCK_intent); -+ bch2_btree_node_write(c, n2, SIX_LOCK_intent); ++ bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0); ++ bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0); + + /* + * Note that on recursive parent_keys == keys, so we @@ -24089,7 +25415,7 @@ index 000000000000..61c7757bd3ca + + btree_split_insert_keys(as, trans, path, n3, &as->parent_keys); + -+ bch2_btree_node_write(c, n3, SIX_LOCK_intent); ++ bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0); + } + } else { + trace_btree_compact(c, b); @@ -24097,7 +25423,7 @@ index 000000000000..61c7757bd3ca + bch2_btree_build_aux_trees(n1); + six_unlock_write(&n1->c.lock); + -+ bch2_btree_node_write(c, n1, SIX_LOCK_intent); ++ bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0); + + if (parent) + bch2_keylist_add(&as->parent_keys, &n1->key); @@ -24146,7 +25472,9 @@ index 000000000000..61c7757bd3ca + + bch2_trans_verify_locks(trans); + -+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_split], ++ bch2_time_stats_update(&c->times[n2 ++ ? 
BCH_TIME_btree_node_split ++ : BCH_TIME_btree_node_compact], + start_time); +} + @@ -24232,14 +25560,13 @@ index 000000000000..61c7757bd3ca + struct btree_path *path, + unsigned flags) +{ -+ struct bch_fs *c = trans->c; + struct btree *b = path_l(path)->b; + struct btree_update *as; + unsigned l; + int ret = 0; + + as = bch2_btree_update_start(trans, path, path->level, -+ btree_update_reserve_required(c, b), flags); ++ true, flags); + if (IS_ERR(as)) + return PTR_ERR(as); + @@ -24267,6 +25594,7 @@ index 000000000000..61c7757bd3ca + struct btree *b, *m, *n, *prev, *next, *parent; + struct bpos sib_pos; + size_t sib_u64s; ++ u64 start_time = local_clock(); + int ret = 0; + + BUG_ON(!path->should_be_locked); @@ -24284,8 +25612,8 @@ index 000000000000..61c7757bd3ca + ? bpos_predecessor(b->data->min_key) + : bpos_successor(b->data->max_key); + -+ sib_path = bch2_path_get(trans, false, path->btree_id, -+ sib_pos, U8_MAX, level, true); ++ sib_path = bch2_path_get(trans, path->btree_id, sib_pos, ++ U8_MAX, level, BTREE_ITER_INTENT, _THIS_IP_); + ret = bch2_btree_path_traverse(trans, sib_path, false); + if (ret) + goto err; @@ -24309,15 +25637,17 @@ index 000000000000..61c7757bd3ca + } + + if (bkey_cmp(bpos_successor(prev->data->max_key), next->data->min_key)) { -+ char buf1[100], buf2[100]; ++ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + -+ bch2_bpos_to_text(&PBUF(buf1), prev->data->max_key); -+ bch2_bpos_to_text(&PBUF(buf2), next->data->min_key); ++ bch2_bpos_to_text(&buf1, prev->data->max_key); ++ bch2_bpos_to_text(&buf2, next->data->min_key); + bch_err(c, + "btree topology error in btree merge:\n" + " prev ends at %s\n" + " next starts at %s", -+ buf1, buf2); ++ buf1.buf, buf2.buf); ++ printbuf_exit(&buf1); ++ printbuf_exit(&buf2); + bch2_topology_error(c); + ret = -EIO; + goto err; @@ -24347,11 +25677,10 @@ index 000000000000..61c7757bd3ca + goto out; + + parent = btree_node_parent(path, b); -+ as = bch2_btree_update_start(trans, path, level, -+ 
btree_update_reserve_required(c, parent) + 1, -+ flags| ++ as = bch2_btree_update_start(trans, path, level, false, + BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_USE_RESERVE); ++ BTREE_INSERT_USE_RESERVE| ++ flags); + ret = PTR_ERR_OR_ZERO(as); + if (ret) + goto err; @@ -24364,6 +25693,10 @@ index 000000000000..61c7757bd3ca + n = bch2_btree_node_alloc(as, b->c.level); + bch2_btree_update_add_new_node(as, n); + ++ SET_BTREE_NODE_SEQ(n->data, ++ max(BTREE_NODE_SEQ(b->data), ++ BTREE_NODE_SEQ(m->data)) + 1); ++ + btree_set_min(n, prev->data->min_key); + btree_set_max(n, next->data->max_key); + n->data->format = new_f; @@ -24376,7 +25709,7 @@ index 000000000000..61c7757bd3ca + bch2_btree_build_aux_trees(n); + six_unlock_write(&n->c.lock); + -+ bch2_btree_node_write(c, n, SIX_LOCK_intent); ++ bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); + + bkey_init(&delete.k); + delete.k.p = prev->key.k.p; @@ -24404,6 +25737,8 @@ index 000000000000..61c7757bd3ca + six_unlock_intent(&n->c.lock); + + bch2_btree_update_done(as); ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time); +out: +err: + bch2_path_put(trans, sib_path, true); @@ -24428,10 +25763,7 @@ index 000000000000..61c7757bd3ca + + parent = btree_node_parent(iter->path, b); + as = bch2_btree_update_start(trans, iter->path, b->c.level, -+ (parent -+ ? 
btree_update_reserve_required(c, parent) -+ : 0) + 1, -+ flags); ++ false, flags); + ret = PTR_ERR_OR_ZERO(as); + if (ret) { + trace_btree_gc_rewrite_node_fail(c, b); @@ -24448,7 +25780,7 @@ index 000000000000..61c7757bd3ca + + trace_btree_gc_rewrite_node(c, b); + -+ bch2_btree_node_write(c, n, SIX_LOCK_intent); ++ bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); + + if (parent) { + bch2_keylist_add(&as->parent_keys, &n->key); @@ -24520,9 +25852,6 @@ index 000000000000..61c7757bd3ca +{ + struct async_btree_rewrite *a; + -+ if (!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)) -+ return; -+ + if (!percpu_ref_tryget(&c->writes)) + return; + @@ -24551,21 +25880,14 @@ index 000000000000..61c7757bd3ca + struct bch_fs *c = trans->c; + struct btree_iter iter2 = { NULL }; + struct btree *parent; -+ u64 journal_entries[BKEY_BTREE_PTR_U64s_MAX]; + int ret; + + if (!skip_triggers) { -+ ret = bch2_trans_mark_key(trans, -+ bkey_s_c_null, -+ bkey_i_to_s_c(new_key), -+ BTREE_TRIGGER_INSERT); ++ ret = bch2_trans_mark_new(trans, new_key, 0); + if (ret) + return ret; + -+ ret = bch2_trans_mark_key(trans, -+ bkey_i_to_s_c(&b->key), -+ bkey_s_c_null, -+ BTREE_TRIGGER_OVERWRITE); ++ ret = bch2_trans_mark_old(trans, bkey_i_to_s_c(&b->key), 0); + if (ret) + return ret; + } @@ -24582,7 +25904,8 @@ index 000000000000..61c7757bd3ca + bch2_trans_copy_iter(&iter2, iter); + + iter2.path = bch2_btree_path_make_mut(trans, iter2.path, -+ iter2.flags & BTREE_ITER_INTENT); ++ iter2.flags & BTREE_ITER_INTENT, ++ _THIS_IP_); + + BUG_ON(iter2.path->level != b->c.level); + BUG_ON(bpos_cmp(iter2.path->pos, new_key->k.p)); @@ -24590,6 +25913,9 @@ index 000000000000..61c7757bd3ca + btree_node_unlock(iter2.path, iter2.path->level); + path_l(iter2.path)->b = BTREE_ITER_NO_NODE_UP; + iter2.path->level++; ++ btree_path_set_dirty(iter2.path, BTREE_ITER_NEED_TRAVERSE); ++ ++ bch2_btree_path_check_sort(trans, iter2.path, 0); + + ret = bch2_btree_iter_traverse(&iter2) ?: + bch2_trans_update(trans, &iter2, 
new_key, BTREE_TRIGGER_NORUN); @@ -24598,19 +25924,24 @@ index 000000000000..61c7757bd3ca + } else { + BUG_ON(btree_node_root(c, b) != b); + -+ trans->extra_journal_entries = (void *) &journal_entries[0]; -+ trans->extra_journal_entry_u64s = -+ journal_entry_set((void *) &journal_entries[0], -+ BCH_JSET_ENTRY_btree_root, -+ b->c.btree_id, b->c.level, -+ new_key, new_key->k.u64s); ++ ret = darray_make_room(trans->extra_journal_entries, ++ jset_u64s(new_key->k.u64s)); ++ if (ret) ++ return ret; ++ ++ journal_entry_set((void *) &darray_top(trans->extra_journal_entries), ++ BCH_JSET_ENTRY_btree_root, ++ b->c.btree_id, b->c.level, ++ new_key, new_key->k.u64s); ++ trans->extra_journal_entries.nr += jset_u64s(new_key->k.u64s); + } + + ret = bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_NOCHECK_RW| ++ BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_JOURNAL_RECLAIM| -+ BTREE_INSERT_JOURNAL_RESERVED); ++ JOURNAL_WATERMARK_reserved); + if (ret) + goto err; + @@ -24673,7 +26004,7 @@ index 000000000000..61c7757bd3ca + return -EINTR; + } + -+ new_hash = bch2_btree_node_mem_alloc(c); ++ new_hash = bch2_btree_node_mem_alloc(c, false); + } + + path->intent_ref++; @@ -24749,7 +26080,7 @@ index 000000000000..61c7757bd3ca + closure_sync(&cl); + } while (ret); + -+ b = bch2_btree_node_mem_alloc(c); ++ b = bch2_btree_node_mem_alloc(c, false); + bch2_btree_cache_cannibalize_unlock(c); + + set_btree_node_fake(b); @@ -24881,10 +26212,10 @@ index 000000000000..61c7757bd3ca +} diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h new file mode 100644 -index 000000000000..8e03bd987d6d +index 000000000000..e72eb8795616 --- /dev/null +++ b/fs/bcachefs/btree_update_interior.h -@@ -0,0 +1,318 @@ +@@ -0,0 +1,321 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_UPDATE_INTERIOR_H +#define _BCACHEFS_BTREE_UPDATE_INTERIOR_H @@ -24922,6 +26253,7 @@ index 000000000000..8e03bd987d6d +struct btree_update { + struct closure cl; + 
struct bch_fs *c; ++ u64 start_time; + + struct list_head list; + struct list_head unwritten_list; @@ -24962,18 +26294,20 @@ index 000000000000..8e03bd987d6d + struct journal_entry_pin journal; + + /* Preallocated nodes we reserve when we start the update: */ -+ struct btree *prealloc_nodes[BTREE_UPDATE_NODES_MAX]; -+ unsigned nr_prealloc_nodes; ++ struct prealloc_nodes { ++ struct btree *b[BTREE_UPDATE_NODES_MAX]; ++ unsigned nr; ++ } prealloc_nodes[2]; + + /* Nodes being freed: */ + struct keylist old_keys; + u64 _old_keys[BTREE_UPDATE_NODES_MAX * -+ BKEY_BTREE_PTR_VAL_U64s_MAX]; ++ BKEY_BTREE_PTR_U64s_MAX]; + + /* Nodes being added: */ + struct keylist new_keys; + u64 _new_keys[BTREE_UPDATE_NODES_MAX * -+ BKEY_BTREE_PTR_VAL_U64s_MAX]; ++ BKEY_BTREE_PTR_U64s_MAX]; + + /* New nodes, that will be made reachable by this update: */ + struct btree *new_nodes[BTREE_UPDATE_NODES_MAX]; @@ -25104,7 +26438,7 @@ index 000000000000..8e03bd987d6d +{ + ssize_t used = bset_byte_offset(b, end) / sizeof(u64) + + b->whiteout_u64s; -+ ssize_t total = c->opts.btree_node_size << 6; ++ ssize_t total = c->opts.btree_node_size >> 3; + + /* Always leave one extra u64 for bch2_varint_decode: */ + used++; @@ -25205,10 +26539,10 @@ index 000000000000..8e03bd987d6d +#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */ diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c new file mode 100644 -index 000000000000..112ac7caf579 +index 000000000000..a0480c63dd81 --- /dev/null +++ b/fs/bcachefs/btree_update_leaf.c -@@ -0,0 +1,1518 @@ +@@ -0,0 +1,1756 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -25226,6 +26560,7 @@ index 000000000000..112ac7caf579 +#include "journal.h" +#include "journal_reclaim.h" +#include "keylist.h" ++#include "recovery.h" +#include "subvolume.h" +#include "replicas.h" + @@ -25233,6 +26568,10 @@ index 000000000000..112ac7caf579 +#include +#include + ++static int __must_check ++bch2_trans_update_by_path(struct btree_trans *, struct 
btree_path *, ++ struct bkey_i *, enum btree_update_flags); ++ +static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, + const struct btree_insert_entry *r) +{ @@ -25373,10 +26712,24 @@ index 000000000000..112ac7caf579 + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct btree_write *w = container_of(pin, struct btree_write, journal); + struct btree *b = container_of(w, struct btree, writes[i]); ++ unsigned long old, new, v; ++ unsigned idx = w - b->writes; + -+ btree_node_lock_type(c, b, SIX_LOCK_read); -+ bch2_btree_node_write_cond(c, b, -+ (btree_current_write(b) == w && w->journal.seq == seq)); ++ six_lock_read(&b->c.lock, NULL, NULL); ++ v = READ_ONCE(b->flags); ++ ++ do { ++ old = new = v; ++ ++ if (!(old & (1 << BTREE_NODE_dirty)) || ++ !!(old & (1 << BTREE_NODE_write_idx)) != idx || ++ w->journal.seq != seq) ++ break; ++ ++ new |= 1 << BTREE_NODE_need_write; ++ } while ((v = cmpxchg(&b->flags, old, new)) != old); ++ ++ btree_node_write_if_need(c, b, SIX_LOCK_read); + six_unlock_read(&b->c.lock); + return 0; +} @@ -25405,7 +26758,7 @@ index 000000000000..112ac7caf579 +/** + * btree_insert_key - insert a key one key into a leaf node + */ -+static bool btree_insert_key_leaf(struct btree_trans *trans, ++static void btree_insert_key_leaf(struct btree_trans *trans, + struct btree_insert_entry *insert) +{ + struct bch_fs *c = trans->c; @@ -25416,12 +26769,9 @@ index 000000000000..112ac7caf579 + int old_live_u64s = b->nr.live_u64s; + int live_u64s_added, u64s_added; + -+ EBUG_ON(!insert->level && -+ !test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)); -+ + if (unlikely(!bch2_btree_bset_insert_key(trans, insert->path, b, + &insert_l(insert)->iter, insert->k))) -+ return false; ++ return; + + i->journal_seq = cpu_to_le64(max(trans->journal_res.seq, + le64_to_cpu(i->journal_seq))); @@ -25429,7 +26779,7 @@ index 000000000000..112ac7caf579 + bch2_btree_add_journal_pin(c, b, trans->journal_res.seq); + + if 
(unlikely(!btree_node_dirty(b))) -+ set_btree_node_dirty(c, b); ++ set_btree_node_dirty_acct(c, b); + + live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; + u64s_added = (int) bset_u64s(t) - old_u64s; @@ -25442,8 +26792,6 @@ index 000000000000..112ac7caf579 + if (u64s_added > live_u64s_added && + bch2_maybe_compact_whiteouts(c, b)) + bch2_trans_node_reinit_iter(trans, b); -+ -+ return true; +} + +/* Cached btree updates: */ @@ -25479,7 +26827,7 @@ index 000000000000..112ac7caf579 + return ret; + + if (!bch2_trans_relock(trans)) { -+ trace_trans_restart_journal_preres_get(trans->ip, trace_ip); ++ trace_trans_restart_journal_preres_get(trans->fn, trace_ip); + return -EINTR; + } + @@ -25492,15 +26840,40 @@ index 000000000000..112ac7caf579 + struct bch_fs *c = trans->c; + int ret; + -+ if (trans->flags & BTREE_INSERT_JOURNAL_RESERVED) -+ flags |= JOURNAL_RES_GET_RESERVED; -+ + ret = bch2_journal_res_get(&c->journal, &trans->journal_res, -+ trans->journal_u64s, flags); ++ trans->journal_u64s, ++ flags| ++ (trans->flags & JOURNAL_WATERMARK_MASK)); + + return ret == -EAGAIN ? 
BTREE_INSERT_NEED_JOURNAL_RES : ret; +} + ++#define JSET_ENTRY_LOG_U64s 4 ++ ++static noinline void journal_transaction_name(struct btree_trans *trans) ++{ ++ struct bch_fs *c = trans->c; ++ struct jset_entry *entry = journal_res_entry(&c->journal, &trans->journal_res); ++ struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); ++ unsigned u64s = JSET_ENTRY_LOG_U64s - 1; ++ unsigned b, buflen = u64s * sizeof(u64); ++ ++ l->entry.u64s = cpu_to_le16(u64s); ++ l->entry.btree_id = 0; ++ l->entry.level = 0; ++ l->entry.type = BCH_JSET_ENTRY_log; ++ l->entry.pad[0] = 0; ++ l->entry.pad[1] = 0; ++ l->entry.pad[2] = 0; ++ b = min_t(unsigned, strlen(trans->fn), buflen); ++ memcpy(l->d, trans->fn, b); ++ while (b < buflen) ++ l->d[b++] = '\0'; ++ ++ trans->journal_res.offset += JSET_ENTRY_LOG_U64s; ++ trans->journal_res.u64s -= JSET_ENTRY_LOG_U64s; ++} ++ +static inline enum btree_insert_ret +btree_key_can_insert(struct btree_trans *trans, + struct btree *b, @@ -25519,14 +26892,15 @@ index 000000000000..112ac7caf579 + struct btree_path *path, + unsigned u64s) +{ ++ struct bch_fs *c = trans->c; + struct bkey_cached *ck = (void *) path->l[0].b; -+ unsigned new_u64s; ++ unsigned old_u64s = ck->u64s, new_u64s; + struct bkey_i *new_k; + + EBUG_ON(path->level); + + if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && -+ bch2_btree_key_cache_must_wait(trans->c) && ++ bch2_btree_key_cache_must_wait(c) && + !(trans->flags & BTREE_INSERT_JOURNAL_RECLAIM)) + return BTREE_INSERT_NEED_JOURNAL_RECLAIM; + @@ -25541,12 +26915,27 @@ index 000000000000..112ac7caf579 + + new_u64s = roundup_pow_of_two(u64s); + new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS); -+ if (!new_k) ++ if (!new_k) { ++ bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", ++ bch2_btree_ids[path->btree_id], new_u64s); + return -ENOMEM; ++ } + + ck->u64s = new_u64s; + ck->k = new_k; -+ return BTREE_INSERT_OK; ++ /* ++ * Keys returned by peek() are no longer valid pointers, so 
we need a ++ * transaction restart: ++ */ ++ trace_trans_restart_key_cache_key_realloced(trans->fn, _RET_IP_, ++ path->btree_id, &path->pos, ++ old_u64s, new_u64s); ++ /* ++ * Not using btree_trans_restart() because we can't unlock here, we have ++ * write locks held: ++ */ ++ trans->restarted = true; ++ return -EINTR; +} + +static inline void do_btree_insert_one(struct btree_trans *trans, @@ -25554,18 +26943,16 @@ index 000000000000..112ac7caf579 +{ + struct bch_fs *c = trans->c; + struct journal *j = &c->journal; -+ bool did_work; + + EBUG_ON(trans->journal_res.ref != + !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)); + + i->k->k.needs_whiteout = false; + -+ did_work = !i->cached -+ ? btree_insert_key_leaf(trans, i) -+ : bch2_btree_insert_key_cached(trans, i->path, i->k); -+ if (!did_work) -+ return; ++ if (!i->cached) ++ btree_insert_key_leaf(trans, i); ++ else ++ bch2_btree_insert_key_cached(trans, i->path, i->k); + + if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { + bch2_journal_add_keys(j, &trans->journal_res, @@ -25578,10 +26965,163 @@ index 000000000000..112ac7caf579 + } +} + -+static noinline void bch2_trans_mark_gc(struct btree_trans *trans) ++/* Triggers: */ ++ ++static int run_one_mem_trigger(struct btree_trans *trans, ++ struct btree_insert_entry *i, ++ unsigned flags) ++{ ++ struct bkey_s_c old = { &i->old_k, i->old_v }; ++ struct bkey_i *new = i->k; ++ int ret; ++ ++ if (unlikely(flags & BTREE_TRIGGER_NORUN)) ++ return 0; ++ ++ if (!btree_node_type_needs_gc(i->btree_id)) ++ return 0; ++ ++ if (bch2_bkey_ops[old.k->type].atomic_trigger == ++ bch2_bkey_ops[i->k->k.type].atomic_trigger && ++ ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { ++ ret = bch2_mark_key(trans, old, bkey_i_to_s_c(new), ++ BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); ++ } else { ++ struct bkey _deleted = KEY(0, 0, 0); ++ struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; ++ ++ _deleted.p = i->path->pos; ++ ++ ret = bch2_mark_key(trans, 
deleted, bkey_i_to_s_c(new), ++ BTREE_TRIGGER_INSERT|flags) ?: ++ bch2_mark_key(trans, old, deleted, ++ BTREE_TRIGGER_OVERWRITE|flags); ++ } ++ ++ return ret; ++} ++ ++static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i, ++ bool overwrite) ++{ ++ /* ++ * Transactional triggers create new btree_insert_entries, so we can't ++ * pass them a pointer to a btree_insert_entry, that memory is going to ++ * move: ++ */ ++ struct bkey old_k = i->old_k; ++ struct bkey_s_c old = { &old_k, i->old_v }; ++ ++ if ((i->flags & BTREE_TRIGGER_NORUN) || ++ !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) ++ return 0; ++ ++ if (!i->insert_trigger_run && ++ !i->overwrite_trigger_run && ++ bch2_bkey_ops[old.k->type].trans_trigger == ++ bch2_bkey_ops[i->k->k.type].trans_trigger && ++ ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { ++ i->overwrite_trigger_run = true; ++ i->insert_trigger_run = true; ++ return bch2_trans_mark_key(trans, old, i->k, ++ BTREE_TRIGGER_INSERT| ++ BTREE_TRIGGER_OVERWRITE| ++ i->flags) ?: 1; ++ } else if (overwrite && !i->overwrite_trigger_run) { ++ i->overwrite_trigger_run = true; ++ return bch2_trans_mark_old(trans, old, i->flags) ?: 1; ++ } else if (!i->insert_trigger_run) { ++ i->insert_trigger_run = true; ++ return bch2_trans_mark_new(trans, i->k, i->flags) ?: 1; ++ } else { ++ return 0; ++ } ++} ++ ++static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, ++ struct btree_insert_entry *btree_id_start) ++{ ++ struct btree_insert_entry *i; ++ bool trans_trigger_run; ++ int ret, overwrite; ++ ++ for (overwrite = 1; overwrite >= 0; --overwrite) { ++ ++ /* ++ * Running triggers will append more updates to the list of updates as ++ * we're walking it: ++ */ ++ do { ++ trans_trigger_run = false; ++ ++ for (i = btree_id_start; ++ i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; ++ i++) { ++ if (i->btree_id != btree_id) ++ continue; ++ ++ ret = 
run_one_trans_trigger(trans, i, overwrite); ++ if (ret < 0) ++ return ret; ++ if (ret) ++ trans_trigger_run = true; ++ } ++ } while (trans_trigger_run); ++ } ++ ++ return 0; ++} ++ ++static int bch2_trans_commit_run_triggers(struct btree_trans *trans) ++{ ++ struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates; ++ unsigned btree_id = 0; ++ int ret = 0; ++ ++ /* ++ * ++ * For a given btree, this algorithm runs insert triggers before ++ * overwrite triggers: this is so that when extents are being moved ++ * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before ++ * they are re-added. ++ */ ++ for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { ++ if (btree_id == BTREE_ID_alloc) ++ continue; ++ ++ while (btree_id_start < trans->updates + trans->nr_updates && ++ btree_id_start->btree_id < btree_id) ++ btree_id_start++; ++ ++ ret = run_btree_triggers(trans, btree_id, btree_id_start); ++ if (ret) ++ return ret; ++ } ++ ++ trans_for_each_update(trans, i) { ++ if (i->btree_id > BTREE_ID_alloc) ++ break; ++ if (i->btree_id == BTREE_ID_alloc) { ++ ret = run_btree_triggers(trans, BTREE_ID_alloc, i); ++ if (ret) ++ return ret; ++ break; ++ } ++ } ++ ++ trans_for_each_update(trans, i) ++ BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) && ++ (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && ++ (!i->insert_trigger_run || !i->overwrite_trigger_run)); ++ ++ return 0; ++} ++ ++static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; ++ int ret = 0; + + trans_for_each_update(trans, i) { + /* @@ -25590,10 +27130,14 @@ index 000000000000..112ac7caf579 + */ + BUG_ON(i->cached || i->level); + -+ if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) -+ bch2_mark_update(trans, i->path, i->k, -+ i->flags|BTREE_TRIGGER_GC); ++ if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) { ++ ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC); ++ if (ret) 
++ break; ++ } + } ++ ++ return ret; +} + +static inline int @@ -25609,7 +27153,7 @@ index 000000000000..112ac7caf579 + int ret; + + if (race_fault()) { -+ trace_trans_restart_fault_inject(trans->ip, trace_ip); ++ trace_trans_restart_fault_inject(trans->fn, trace_ip); + trans->restarted = true; + return -EINTR; + } @@ -25646,17 +27190,32 @@ index 000000000000..112ac7caf579 + + if (btree_node_type_needs_gc(i->bkey_type)) + marking = true; -+ } + -+ if (marking) { -+ percpu_down_read(&c->mark_lock); -+ } ++ /* ++ * Revalidate before calling mem triggers - XXX, ugly: ++ * ++ * - successful btree node splits don't cause transaction ++ * restarts and will have invalidated the pointer to the bkey ++ * value ++ * - btree_node_lock_for_insert() -> btree_node_prep_for_write() ++ * when it has to resort ++ * - btree_key_can_insert_cached() when it has to reallocate ++ * ++ * Ugly because we currently have no way to tell if the ++ * pointer's been invalidated, which means it's debatabale ++ * whether we should be stashing the old key at all. 
++ */ ++ i->old_v = bch2_btree_path_peek_slot(i->path, &i->old_k).v; + -+ /* Must be called under mark_lock: */ -+ if (marking && trans->fs_usage_deltas && -+ !bch2_replicas_delta_list_marked(c, trans->fs_usage_deltas)) { -+ ret = BTREE_INSERT_NEED_MARK_REPLICAS; -+ goto err; ++ if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) { ++ struct bkey_i *j_k = ++ bch2_journal_keys_peek(c, i->btree_id, i->level, i->k->k.p); ++ ++ if (j_k && !bpos_cmp(j_k->k.p, i->k->k.p)) { ++ i->old_k = j_k->k; ++ i->old_v = &j_k->v; ++ } ++ } + } + + /* @@ -25667,18 +27226,21 @@ index 000000000000..112ac7caf579 + ret = bch2_trans_journal_res_get(trans, + JOURNAL_RES_GET_NONBLOCK); + if (ret) -+ goto err; ++ return ret; ++ ++ if (unlikely(trans->journal_transaction_names)) ++ journal_transaction_name(trans); + } else { + trans->journal_res.seq = c->journal.replay_journal_seq; + } + -+ if (unlikely(trans->extra_journal_entry_u64s)) { ++ if (unlikely(trans->extra_journal_entries.nr)) { + memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), -+ trans->extra_journal_entries, -+ trans->extra_journal_entry_u64s); ++ trans->extra_journal_entries.data, ++ trans->extra_journal_entries.nr); + -+ trans->journal_res.offset += trans->extra_journal_entry_u64s; -+ trans->journal_res.u64s -= trans->extra_journal_entry_u64s; ++ trans->journal_res.offset += trans->extra_journal_entries.nr; ++ trans->journal_res.u64s -= trans->extra_journal_entries.nr; + } + + /* @@ -25695,22 +27257,25 @@ index 000000000000..112ac7caf579 + i->k->k.version = MAX_VERSION; + } + ++ if (trans->fs_usage_deltas && ++ bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas)) ++ return BTREE_INSERT_NEED_MARK_REPLICAS; ++ + trans_for_each_update(trans, i) -+ if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) -+ bch2_mark_update(trans, i->path, i->k, i->flags); ++ if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) { ++ ret = run_one_mem_trigger(trans, i, i->flags); ++ if (ret) ++ 
return ret; ++ } + -+ if (marking && trans->fs_usage_deltas) -+ bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas); -+ -+ if (unlikely(c->gc_pos.phase)) -+ bch2_trans_mark_gc(trans); ++ if (unlikely(c->gc_pos.phase)) { ++ ret = bch2_trans_commit_run_gc_triggers(trans); ++ if (ret) ++ return ret; ++ } + + trans_for_each_update(trans, i) + do_btree_insert_one(trans, i); -+err: -+ if (marking) { -+ percpu_up_read(&c->mark_lock); -+ } + + return ret; +} @@ -25797,8 +27362,10 @@ index 000000000000..112ac7caf579 + if (have_conflicting_read_lock(trans, i->path)) + goto fail; + -+ __btree_node_lock_type(trans->c, insert_l(i)->b, -+ SIX_LOCK_write); ++ btree_node_lock_type(trans, i->path, ++ insert_l(i)->b, ++ i->path->pos, i->level, ++ SIX_LOCK_write, NULL, NULL); + } + + bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); @@ -25813,10 +27380,18 @@ index 000000000000..112ac7caf579 + bch2_btree_node_unlock_write_inlined(trans, i->path, insert_l(i)->b); + } + -+ trace_trans_restart_would_deadlock_write(trans->ip); ++ trace_trans_restart_would_deadlock_write(trans->fn); + return btree_trans_restart(trans); +} + ++static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) ++{ ++ struct btree_insert_entry *i; ++ ++ trans_for_each_update(trans, i) ++ bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); ++} ++ +/* + * Get journal reservation, take write locks, and attempt to do btree update(s): + */ @@ -25826,42 +27401,29 @@ index 000000000000..112ac7caf579 +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; -+ struct bkey_s_c old; + int ret, u64s_delta = 0; + + trans_for_each_update(trans, i) { + const char *invalid = bch2_bkey_invalid(c, + bkey_i_to_s_c(i->k), i->bkey_type); + if (invalid) { -+ char buf[200]; ++ struct printbuf buf = PRINTBUF; + -+ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); -+ bch_err(c, "invalid bkey %s on insert from %ps -> %ps: %s\n", -+ buf, (void *) trans->ip, -+ 
(void *) i->ip_allocated, invalid); -+ bch2_fatal_error(c); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); ++ bch2_fs_fatal_error(c, "invalid bkey %s on insert from %s -> %ps: %s\n", ++ buf.buf, trans->fn, (void *) i->ip_allocated, invalid); ++ printbuf_exit(&buf); + return -EINVAL; + } + btree_insert_entry_checks(trans, i); + } + + trans_for_each_update(trans, i) { -+ struct bkey u; -+ -+ /* -+ * peek_slot() doesn't yet work on iterators that point to -+ * interior nodes: -+ */ -+ if (i->cached || i->level) ++ if (i->cached) + continue; + -+ old = bch2_btree_path_peek_slot(i->path, &u); -+ ret = bkey_err(old); -+ if (unlikely(ret)) -+ return ret; -+ + u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0; -+ u64s_delta -= !bkey_deleted(old.k) ? old.k->u64s : 0; ++ u64s_delta -= i->old_btree_u64s; + + if (!same_leaf_as_next(trans, i)) { + if (u64s_delta <= 0) { @@ -25878,8 +27440,7 @@ index 000000000000..112ac7caf579 + ret = bch2_journal_preres_get(&c->journal, + &trans->journal_preres, trans->journal_preres_u64s, + JOURNAL_RES_GET_NONBLOCK| -+ ((trans->flags & BTREE_INSERT_JOURNAL_RESERVED) -+ ? 
JOURNAL_RES_GET_RESERVED : 0)); ++ (trans->flags & JOURNAL_WATERMARK_MASK)); + if (unlikely(ret == -EAGAIN)) + ret = bch2_trans_journal_preres_get_cold(trans, + trans->journal_preres_u64s, trace_ip); @@ -25894,6 +27455,9 @@ index 000000000000..112ac7caf579 + + ret = bch2_trans_commit_write_locked(trans, stopped_at, trace_ip); + ++ if (!ret && unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) ++ bch2_drop_overwrites_from_journal(trans); ++ + trans_for_each_update(trans, i) + if (!same_leaf_as_prev(trans, i)) + bch2_btree_node_unlock_write_inlined(trans, i->path, @@ -25941,7 +27505,7 @@ index 000000000000..112ac7caf579 + return 0; + + if (ret == -EINTR) -+ trace_trans_restart_btree_node_split(trans->ip, trace_ip, ++ trace_trans_restart_btree_node_split(trans->fn, trace_ip, + i->btree_id, &i->path->pos); + break; + case BTREE_INSERT_NEED_MARK_REPLICAS: @@ -25954,14 +27518,14 @@ index 000000000000..112ac7caf579 + if (bch2_trans_relock(trans)) + return 0; + -+ trace_trans_restart_mark_replicas(trans->ip, trace_ip); ++ trace_trans_restart_mark_replicas(trans->fn, trace_ip); + ret = -EINTR; + break; + case BTREE_INSERT_NEED_JOURNAL_RES: + bch2_trans_unlock(trans); + + if ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) && -+ !(trans->flags & BTREE_INSERT_JOURNAL_RESERVED)) { ++ !(trans->flags & JOURNAL_WATERMARK_reserved)) { + trans->restarted = true; + ret = -EAGAIN; + break; @@ -25974,13 +27538,13 @@ index 000000000000..112ac7caf579 + if (bch2_trans_relock(trans)) + return 0; + -+ trace_trans_restart_journal_res_get(trans->ip, trace_ip); ++ trace_trans_restart_journal_res_get(trans->fn, trace_ip); + ret = -EINTR; + break; + case BTREE_INSERT_NEED_JOURNAL_RECLAIM: + bch2_trans_unlock(trans); + -+ trace_trans_blocked_journal_reclaim(trans->ip, trace_ip); ++ trace_trans_blocked_journal_reclaim(trans->fn, trace_ip); + + wait_event_freezable(c->journal.reclaim_wait, + (ret = journal_reclaim_wait_done(c))); @@ -25990,7 +27554,7 @@ index 000000000000..112ac7caf579 + 
if (bch2_trans_relock(trans)) + return 0; + -+ trace_trans_restart_journal_reclaim(trans->ip, trace_ip); ++ trace_trans_restart_journal_reclaim(trans->fn, trace_ip); + ret = -EINTR; + break; + default: @@ -25999,7 +27563,9 @@ index 000000000000..112ac7caf579 + } + + BUG_ON((ret == EINTR || ret == -EAGAIN) && !trans->restarted); -+ BUG_ON(ret == -ENOSPC && (trans->flags & BTREE_INSERT_NOFAIL)); ++ BUG_ON(ret == -ENOSPC && ++ !(trans->flags & BTREE_INSERT_NOWAIT) && ++ (trans->flags & BTREE_INSERT_NOFAIL)); + + return ret; +} @@ -26010,7 +27576,8 @@ index 000000000000..112ac7caf579 + struct bch_fs *c = trans->c; + int ret; + -+ if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW))) ++ if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW)) || ++ test_bit(BCH_FS_STARTED, &c->flags)) + return -EROFS; + + bch2_trans_unlock(trans); @@ -26026,155 +27593,72 @@ index 000000000000..112ac7caf579 + return 0; +} + -+static int bch2_trans_commit_run_triggers(struct btree_trans *trans) ++/* ++ * This is for updates done in the early part of fsck - btree_gc - before we've ++ * gone RW. we only add the new key to the list of keys for journal replay to ++ * do. ++ */ ++static noinline int ++do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) +{ -+ struct bkey _deleted = KEY(0, 0, 0); -+ struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; -+ struct bkey_s_c old; -+ struct bkey unpacked; -+ struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates; -+ bool trans_trigger_run; -+ unsigned btree_id = 0; ++ struct bch_fs *c = trans->c; ++ struct btree_insert_entry *i; + int ret = 0; + -+ /* -+ * -+ * For a given btree, this algorithm runs insert triggers before -+ * overwrite triggers: this is so that when extents are being moved -+ * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before -+ * they are re-added. 
-+ */ -+ for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { -+ while (btree_id_start < trans->updates + trans->nr_updates && -+ btree_id_start->btree_id < btree_id) -+ btree_id_start++; -+ -+ /* -+ * Running triggers will append more updates to the list of updates as -+ * we're walking it: -+ */ -+ do { -+ trans_trigger_run = false; -+ -+ for (i = btree_id_start; -+ i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; -+ i++) { -+ if (i->insert_trigger_run || -+ (i->flags & BTREE_TRIGGER_NORUN) || -+ !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) -+ continue; -+ -+ BUG_ON(i->overwrite_trigger_run); -+ -+ i->insert_trigger_run = true; -+ trans_trigger_run = true; -+ -+ old = bch2_btree_path_peek_slot(i->path, &unpacked); -+ _deleted.p = i->path->pos; -+ -+ if (old.k->type == i->k->k.type && -+ ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { -+ i->overwrite_trigger_run = true; -+ ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(i->k), -+ BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|i->flags); -+ } else { -+ ret = bch2_trans_mark_key(trans, deleted, bkey_i_to_s_c(i->k), -+ BTREE_TRIGGER_INSERT|i->flags); -+ } -+ -+ if (ret == -EINTR) -+ trace_trans_restart_mark(trans->ip, _RET_IP_, -+ i->btree_id, &i->path->pos); -+ if (ret) -+ return ret; -+ } -+ } while (trans_trigger_run); -+ -+ do { -+ trans_trigger_run = false; -+ -+ for (i = btree_id_start; -+ i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; -+ i++) { -+ if (i->overwrite_trigger_run || -+ (i->flags & BTREE_TRIGGER_NORUN) || -+ !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) -+ continue; -+ -+ BUG_ON(!i->insert_trigger_run); -+ -+ i->overwrite_trigger_run = true; -+ trans_trigger_run = true; -+ -+ old = bch2_btree_path_peek_slot(i->path, &unpacked); -+ _deleted.p = i->path->pos; -+ -+ ret = bch2_trans_mark_key(trans, old, deleted, -+ BTREE_TRIGGER_OVERWRITE|i->flags); -+ -+ if (ret == -EINTR) -+ trace_trans_restart_mark(trans->ip, 
_RET_IP_, -+ i->btree_id, &i->path->pos); -+ if (ret) -+ return ret; -+ } -+ } while (trans_trigger_run); ++ trans_for_each_update(trans, i) { ++ ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k); ++ if (ret) ++ break; + } + -+ trans_for_each_update(trans, i) -+ BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) && -+ (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && -+ (!i->insert_trigger_run || !i->overwrite_trigger_run)); -+ -+ return 0; ++ return ret; +} + +int __bch2_trans_commit(struct btree_trans *trans) +{ ++ struct bch_fs *c = trans->c; + struct btree_insert_entry *i = NULL; + unsigned u64s; + int ret = 0; + + if (!trans->nr_updates && -+ !trans->extra_journal_entry_u64s) ++ !trans->extra_journal_entries.nr) + goto out_reset; + + if (trans->flags & BTREE_INSERT_GC_LOCK_HELD) -+ lockdep_assert_held(&trans->c->gc_lock); ++ lockdep_assert_held(&c->gc_lock); + -+ memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); ++ ret = bch2_trans_commit_run_triggers(trans); ++ if (ret) ++ goto out_reset; + -+ trans->journal_u64s = trans->extra_journal_entry_u64s; -+ trans->journal_preres_u64s = 0; ++ if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) { ++ ret = do_bch2_trans_commit_to_journal_replay(trans); ++ goto out_reset; ++ } + + if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) && -+ unlikely(!percpu_ref_tryget(&trans->c->writes))) { ++ unlikely(!percpu_ref_tryget(&c->writes))) { + ret = bch2_trans_commit_get_rw_cold(trans); + if (ret) + goto out_reset; + } + -+#ifdef CONFIG_BCACHEFS_DEBUG -+ /* -+ * if BTREE_TRIGGER_NORUN is set, it means we're probably being called -+ * from the key cache flush code: -+ */ -+ trans_for_each_update(trans, i) -+ if (!i->cached && -+ !(i->flags & BTREE_TRIGGER_NORUN)) -+ bch2_btree_key_cache_verify_clean(trans, -+ i->btree_id, i->k->k.p); -+#endif ++ memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); + -+ ret = bch2_trans_commit_run_triggers(trans); -+ if (ret) -+ goto out; ++ 
trans->journal_u64s = trans->extra_journal_entries.nr; ++ trans->journal_preres_u64s = 0; ++ ++ trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); ++ ++ if (trans->journal_transaction_names) ++ trans->journal_u64s += JSET_ENTRY_LOG_U64s; + + trans_for_each_update(trans, i) { + BUG_ON(!i->path->should_be_locked); + + if (unlikely(!bch2_btree_path_upgrade(trans, i->path, i->level + 1))) { -+ trace_trans_restart_upgrade(trans->ip, _RET_IP_, ++ trace_trans_restart_upgrade(trans->fn, _RET_IP_, + i->btree_id, &i->path->pos); + ret = btree_trans_restart(trans); + goto out; @@ -26190,7 +27674,7 @@ index 000000000000..112ac7caf579 + } + + if (trans->extra_journal_res) { -+ ret = bch2_disk_reservation_add(trans->c, trans->disk_res, ++ ret = bch2_disk_reservation_add(c, trans->disk_res, + trans->extra_journal_res, + (trans->flags & BTREE_INSERT_NOFAIL) + ? BCH_DISK_RESERVATION_NOFAIL : 0); @@ -26209,10 +27693,10 @@ index 000000000000..112ac7caf579 + if (ret) + goto err; +out: -+ bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); ++ bch2_journal_preres_put(&c->journal, &trans->journal_preres); + + if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) -+ percpu_ref_put(&trans->c->writes); ++ percpu_ref_put(&c->writes); +out_reset: + trans_for_each_update(trans, i) + bch2_path_put(trans, i->path, true); @@ -26220,8 +27704,7 @@ index 000000000000..112ac7caf579 + trans->extra_journal_res = 0; + trans->nr_updates = 0; + trans->hooks = NULL; -+ trans->extra_journal_entries = NULL; -+ trans->extra_journal_entry_u64s = 0; ++ trans->extra_journal_entries.nr = 0; + + if (trans->fs_usage_deltas) { + trans->fs_usage_deltas->used = 0; @@ -26248,6 +27731,9 @@ index 000000000000..112ac7caf579 + struct bkey_s_c k; + int ret; + ++ if (!btree_type_has_snapshots(id)) ++ return 0; ++ + if (!snapshot_t(c, pos.snapshot)->children[0]) + return 0; + @@ -26276,10 +27762,10 @@ index 000000000000..112ac7caf579 + return ret; +} + -+static int 
bch2_trans_update_extent(struct btree_trans *trans, -+ struct btree_iter *orig_iter, -+ struct bkey_i *insert, -+ enum btree_update_flags flags) ++int bch2_trans_update_extent(struct btree_trans *trans, ++ struct btree_iter *orig_iter, ++ struct bkey_i *insert, ++ enum btree_update_flags flags) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter, update_iter; @@ -26293,7 +27779,7 @@ index 000000000000..112ac7caf579 + BTREE_ITER_INTENT| + BTREE_ITER_WITH_UPDATES| + BTREE_ITER_NOT_EXTENTS); -+ k = bch2_btree_iter_peek(&iter); ++ k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); + if ((ret = bkey_err(k))) + goto err; + if (!k.k) @@ -26437,19 +27923,16 @@ index 000000000000..112ac7caf579 + bkey_reassemble(update, k); + bch2_cut_front(insert->k.p, update); + -+ bch2_trans_copy_iter(&update_iter, &iter); -+ update_iter.pos = update->k.p; -+ ret = bch2_trans_update(trans, &update_iter, update, ++ ret = bch2_trans_update_by_path(trans, iter.path, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + flags); -+ bch2_trans_iter_exit(trans, &update_iter); -+ + if (ret) + goto err; + goto out; + } +next: -+ k = bch2_btree_iter_next(&iter); ++ bch2_btree_iter_advance(&iter); ++ k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); + if ((ret = bkey_err(k))) + goto err; + if (!k.k) @@ -26510,7 +27993,8 @@ index 000000000000..112ac7caf579 + pos.snapshot++; + + for_each_btree_key_norestart(trans, iter, btree_id, pos, -+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) { ++ BTREE_ITER_ALL_SNAPSHOTS| ++ BTREE_ITER_NOPRESERVE, k, ret) { + if (bkey_cmp(k.k->p, pos)) + break; + @@ -26525,48 +28009,35 @@ index 000000000000..112ac7caf579 + return ret; +} + -+int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, -+ struct bkey_i *k, enum btree_update_flags flags) ++static int __must_check ++bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, ++ struct bkey_i *k, enum btree_update_flags flags) +{ ++ struct bch_fs *c = 
trans->c; + struct btree_insert_entry *i, n; + -+ BUG_ON(!iter->path->should_be_locked); -+ -+ if (iter->flags & BTREE_ITER_IS_EXTENTS) -+ return bch2_trans_update_extent(trans, iter, k, flags); ++ BUG_ON(!path->should_be_locked); + + BUG_ON(trans->nr_updates >= BTREE_ITER_MAX); -+ BUG_ON(bpos_cmp(k->k.p, iter->path->pos)); ++ BUG_ON(bpos_cmp(k->k.p, path->pos)); + + n = (struct btree_insert_entry) { + .flags = flags, -+ .bkey_type = __btree_node_type(iter->path->level, iter->btree_id), -+ .btree_id = iter->btree_id, -+ .level = iter->path->level, -+ .cached = iter->flags & BTREE_ITER_CACHED, -+ .path = iter->path, ++ .bkey_type = __btree_node_type(path->level, path->btree_id), ++ .btree_id = path->btree_id, ++ .level = path->level, ++ .cached = path->cached, ++ .path = path, + .k = k, + .ip_allocated = _RET_IP_, + }; + -+ __btree_path_get(n.path, true); -+ +#ifdef CONFIG_BCACHEFS_DEBUG + trans_for_each_update(trans, i) + BUG_ON(i != trans->updates && + btree_insert_entry_cmp(i - 1, i) >= 0); +#endif + -+ if (bkey_deleted(&n.k->k) && -+ (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) { -+ int ret = need_whiteout_for_snapshot(trans, n.btree_id, n.k->k.p); -+ if (unlikely(ret < 0)) -+ return ret; -+ -+ if (ret) -+ n.k->k.type = KEY_TYPE_whiteout; -+ } -+ + /* + * Pending updates are kept sorted: first, find position of new update, + * then delete/trim any updates the new update overwrites: @@ -26579,27 +28050,95 @@ index 000000000000..112ac7caf579 + !btree_insert_entry_cmp(&n, i)) { + BUG_ON(i->insert_trigger_run || i->overwrite_trigger_run); + -+ /* -+ * This is a hack to ensure that inode creates update the btree, -+ * not the key cache, which helps with cache coherency issues in -+ * other areas: -+ */ -+ if (n.cached && !i->cached) { -+ i->k = n.k; -+ i->flags = n.flags; -+ -+ __btree_path_get(n.path, false); -+ } else { -+ bch2_path_put(trans, i->path, true); -+ *i = n; -+ } -+ } else ++ bch2_path_put(trans, i->path, true); ++ i->flags = n.flags; ++ i->cached = 
n.cached; ++ i->k = n.k; ++ i->path = n.path; ++ i->ip_allocated = n.ip_allocated; ++ } else { + array_insert_item(trans->updates, trans->nr_updates, + i - trans->updates, n); + ++ i->old_v = bch2_btree_path_peek_slot(path, &i->old_k).v; ++ i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0; ++ ++ if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) { ++ struct bkey_i *j_k = ++ bch2_journal_keys_peek(c, n.btree_id, n.level, k->k.p); ++ ++ if (j_k && !bpos_cmp(j_k->k.p, i->k->k.p)) { ++ i->old_k = j_k->k; ++ i->old_v = &j_k->v; ++ } ++ } ++ } ++ ++ __btree_path_get(n.path, true); + return 0; +} + ++int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_i *k, enum btree_update_flags flags) ++{ ++ struct btree_path *path = iter->update_path ?: iter->path; ++ struct bkey_cached *ck; ++ int ret; ++ ++ if (iter->flags & BTREE_ITER_IS_EXTENTS) ++ return bch2_trans_update_extent(trans, iter, k, flags); ++ ++ if (bkey_deleted(&k->k) && ++ !(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && ++ (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) { ++ ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p); ++ if (unlikely(ret < 0)) ++ return ret; ++ ++ if (ret) ++ k->k.type = KEY_TYPE_whiteout; ++ } ++ ++ if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && ++ !path->cached && ++ !path->level && ++ btree_id_cached(trans->c, path->btree_id)) { ++ if (!iter->key_cache_path || ++ !iter->key_cache_path->should_be_locked || ++ bpos_cmp(iter->key_cache_path->pos, k->k.p)) { ++ if (!iter->key_cache_path) ++ iter->key_cache_path = ++ bch2_path_get(trans, path->btree_id, path->pos, 1, 0, ++ BTREE_ITER_INTENT| ++ BTREE_ITER_CACHED, _THIS_IP_); ++ ++ iter->key_cache_path = ++ bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos, ++ iter->flags & BTREE_ITER_INTENT, ++ _THIS_IP_); ++ ++ ret = bch2_btree_path_traverse(trans, iter->key_cache_path, ++ BTREE_ITER_CACHED); ++ if (unlikely(ret)) ++ return ret; ++ ++ ck = 
(void *) iter->key_cache_path->l[0].b; ++ ++ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { ++ trace_trans_restart_key_cache_raced(trans->fn, _RET_IP_); ++ btree_trans_restart(trans); ++ return -EINTR; ++ } ++ ++ iter->key_cache_path->should_be_locked = true; ++ } ++ ++ path = iter->key_cache_path; ++ } ++ ++ return bch2_trans_update_by_path(trans, path, k, flags); ++} ++ +void bch2_trans_commit_hook(struct btree_trans *trans, + struct btree_trans_commit_hook *h) +{ @@ -26653,19 +28192,21 @@ index 000000000000..112ac7caf579 + +int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, + struct bpos start, struct bpos end, -+ unsigned iter_flags, ++ unsigned update_flags, + u64 *journal_seq) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + -+ bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT|iter_flags); ++ bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT); +retry: + while ((bch2_trans_begin(trans), + (k = bch2_btree_iter_peek(&iter)).k) && + !(ret = bkey_err(k)) && + bkey_cmp(iter.pos, end) < 0) { ++ struct disk_reservation disk_res = ++ bch2_disk_reservation_init(trans->c, 0); + struct bkey_i delete; + + bkey_init(&delete.k); @@ -26686,7 +28227,7 @@ index 000000000000..112ac7caf579 + */ + delete.k.p = iter.pos; + -+ if (btree_node_type_is_extents(id)) { ++ if (iter.flags & BTREE_ITER_IS_EXTENTS) { + unsigned max_sectors = + KEY_SIZE_MAX & (~0 << trans->c->block_bits); + @@ -26700,8 +28241,10 @@ index 000000000000..112ac7caf579 + } + + ret = bch2_trans_update(trans, &iter, &delete, 0) ?: -+ bch2_trans_commit(trans, NULL, journal_seq, -+ BTREE_INSERT_NOFAIL); ++ bch2_trans_commit(trans, &disk_res, journal_seq, ++ BTREE_INSERT_NOFAIL| ++ update_flags); ++ bch2_disk_reservation_put(trans->c, &disk_res); + if (ret) + break; + } @@ -26722,17 +28265,46 @@ index 000000000000..112ac7caf579 + */ +int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, + struct bpos start, struct bpos end, ++ unsigned 
update_flags, + u64 *journal_seq) +{ + return bch2_trans_do(c, NULL, journal_seq, 0, -+ bch2_btree_delete_range_trans(&trans, id, start, end, 0, journal_seq)); ++ bch2_btree_delete_range_trans(&trans, id, start, end, ++ update_flags, journal_seq)); ++} ++ ++int bch2_trans_log_msg(struct btree_trans *trans, const char *msg) ++{ ++ unsigned len = strlen(msg); ++ unsigned u64s = DIV_ROUND_UP(len, sizeof(u64)); ++ struct jset_entry_log *l; ++ int ret; ++ ++ ret = darray_make_room(trans->extra_journal_entries, jset_u64s(u64s)); ++ if (ret) ++ return ret; ++ ++ l = (void *) &darray_top(trans->extra_journal_entries); ++ l->entry.u64s = cpu_to_le16(u64s); ++ l->entry.btree_id = 0; ++ l->entry.level = 1; ++ l->entry.type = BCH_JSET_ENTRY_log; ++ l->entry.pad[0] = 0; ++ l->entry.pad[1] = 0; ++ l->entry.pad[2] = 0; ++ memcpy(l->d, msg, len); ++ while (len & 7) ++ l->d[len++] = '\0'; ++ ++ trans->extra_journal_entries.nr += jset_u64s(u64s); ++ return 0; +} diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c new file mode 100644 -index 000000000000..6fc93b56bcb2 +index 000000000000..7654ab24a909 --- /dev/null +++ b/fs/bcachefs/buckets.c -@@ -0,0 +1,2227 @@ +@@ -0,0 +1,2122 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Code for manipulating bucket marks for garbage collection. 
@@ -26746,6 +28318,7 @@ index 000000000000..6fc93b56bcb2 +#include "btree_gc.h" +#include "btree_update.h" +#include "buckets.h" ++#include "buckets_waiting_for_journal.h" +#include "ec.h" +#include "error.h" +#include "inode.h" @@ -26778,43 +28351,6 @@ index 000000000000..6fc93b56bcb2 + } +} + -+/* -+ * Clear journal_seq_valid for buckets for which it's not needed, to prevent -+ * wraparound: -+ */ -+void bch2_bucket_seq_cleanup(struct bch_fs *c) -+{ -+ u64 journal_seq = atomic64_read(&c->journal.seq); -+ u16 last_seq_ondisk = c->journal.last_seq_ondisk; -+ struct bch_dev *ca; -+ struct bucket_array *buckets; -+ struct bucket *g; -+ struct bucket_mark m; -+ unsigned i; -+ -+ if (journal_seq - c->last_bucket_seq_cleanup < -+ (1U << (BUCKET_JOURNAL_SEQ_BITS - 2))) -+ return; -+ -+ c->last_bucket_seq_cleanup = journal_seq; -+ -+ for_each_member_device(ca, c, i) { -+ down_read(&ca->bucket_lock); -+ buckets = bucket_array(ca); -+ -+ for_each_bucket(g, buckets) { -+ bucket_cmpxchg(g, m, ({ -+ if (!m.journal_seq_valid || -+ bucket_needs_journal_commit(m, last_seq_ondisk)) -+ break; -+ -+ m.journal_seq_valid = 0; -+ })); -+ } -+ up_read(&ca->bucket_lock); -+ } -+} -+ +void bch2_fs_usage_initialize(struct bch_fs *c) +{ + struct bch_fs_usage *usage; @@ -26879,6 +28415,7 @@ index 000000000000..6fc93b56bcb2 + unsigned journal_seq, + bool gc) +{ ++ percpu_rwsem_assert_held(&c->mark_lock); + BUG_ON(!gc && !journal_seq); + + return this_cpu_ptr(gc @@ -27049,36 +28586,24 @@ index 000000000000..6fc93b56bcb2 + return ret; +} + -+static inline int is_unavailable_bucket(struct bucket_mark m) ++static inline int is_unavailable_bucket(struct bch_alloc_v4 a) +{ -+ return !is_available_bucket(m); ++ return a.dirty_sectors || a.stripe; +} + +static inline int bucket_sectors_fragmented(struct bch_dev *ca, -+ struct bucket_mark m) ++ struct bch_alloc_v4 a) +{ -+ return bucket_sectors_used(m) -+ ? max(0, (int) ca->mi.bucket_size - (int) bucket_sectors_used(m)) ++ return a.dirty_sectors ++ ? 
max(0, (int) ca->mi.bucket_size - (int) a.dirty_sectors) + : 0; +} + -+static inline int is_stripe_data_bucket(struct bucket_mark m) ++static inline enum bch_data_type bucket_type(struct bch_alloc_v4 a) +{ -+ return m.stripe && m.data_type != BCH_DATA_parity; -+} -+ -+static inline enum bch_data_type bucket_type(struct bucket_mark m) -+{ -+ return m.cached_sectors && !m.dirty_sectors ++ return a.cached_sectors && !a.dirty_sectors + ? BCH_DATA_cached -+ : m.data_type; -+} -+ -+static bool bucket_became_unavailable(struct bucket_mark old, -+ struct bucket_mark new) -+{ -+ return is_available_bucket(old) && -+ !is_available_bucket(new); ++ : a.data_type; +} + +static inline void account_bucket(struct bch_fs_usage *fs_usage, @@ -27093,21 +28618,13 @@ index 000000000000..6fc93b56bcb2 +} + +static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, -+ struct bucket_mark old, struct bucket_mark new, ++ struct bch_alloc_v4 old, ++ struct bch_alloc_v4 new, + u64 journal_seq, bool gc) +{ + struct bch_fs_usage *fs_usage; + struct bch_dev_usage *u; + -+ /* -+ * Hack for bch2_fs_initialize path, where we're first marking sb and -+ * journal non-transactionally: -+ */ -+ if (!journal_seq && !test_bit(BCH_FS_INITIALIZED, &c->flags)) -+ journal_seq = 1; -+ -+ percpu_rwsem_assert_held(&c->mark_lock); -+ + preempt_disable(); + fs_usage = fs_usage_ptr(c, journal_seq, gc); + u = dev_usage_ptr(ca, journal_seq, gc); @@ -27133,9 +28650,28 @@ index 000000000000..6fc93b56bcb2 + u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new); + + preempt_enable(); ++} + -+ if (!is_available_bucket(old) && is_available_bucket(new)) -+ bch2_wake_allocator(ca); ++static void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca, ++ struct bucket old, struct bucket new, ++ u64 journal_seq, bool gc) ++{ ++ struct bch_alloc_v4 old_a = { ++ .gen = old.gen, ++ .data_type = old.data_type, ++ .dirty_sectors = old.dirty_sectors, ++ .cached_sectors = old.cached_sectors, ++ 
.stripe = old.stripe, ++ }; ++ struct bch_alloc_v4 new_a = { ++ .gen = new.gen, ++ .data_type = new.data_type, ++ .dirty_sectors = new.dirty_sectors, ++ .cached_sectors = new.cached_sectors, ++ .stripe = new.stripe, ++ }; ++ ++ bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, gc); +} + +static inline int __update_replicas(struct bch_fs *c, @@ -27153,25 +28689,50 @@ index 000000000000..6fc93b56bcb2 + return 0; +} + -+static inline int update_replicas(struct bch_fs *c, ++static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k, + struct bch_replicas_entry *r, s64 sectors, + unsigned journal_seq, bool gc) +{ + struct bch_fs_usage __percpu *fs_usage; -+ int idx = bch2_replicas_entry_idx(c, r); ++ int idx, ret = 0; ++ struct printbuf buf = PRINTBUF; + -+ if (idx < 0) -+ return -1; ++ percpu_down_read(&c->mark_lock); ++ buf.atomic++; ++ ++ idx = bch2_replicas_entry_idx(c, r); ++ if (idx < 0 && ++ (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || ++ fsck_err(c, "no replicas entry\n" ++ " while marking %s", ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) { ++ percpu_up_read(&c->mark_lock); ++ ret = bch2_mark_replicas(c, r); ++ percpu_down_read(&c->mark_lock); ++ ++ if (ret) ++ goto err; ++ idx = bch2_replicas_entry_idx(c, r); ++ } ++ if (idx < 0) { ++ ret = -1; ++ goto err; ++ } + + preempt_disable(); + fs_usage = fs_usage_ptr(c, journal_seq, gc); + fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); + fs_usage->replicas[idx] += sectors; + preempt_enable(); -+ return 0; ++err: ++fsck_err: ++ percpu_up_read(&c->mark_lock); ++ printbuf_exit(&buf); ++ return ret; +} + +static inline int update_cached_sectors(struct bch_fs *c, ++ struct bkey_s_c k, + unsigned dev, s64 sectors, + unsigned journal_seq, bool gc) +{ @@ -27179,7 +28740,7 @@ index 000000000000..6fc93b56bcb2 + + bch2_replicas_entry_cached(&r.e, dev); + -+ return update_replicas(c, &r.e, sectors, journal_seq, gc); ++ return update_replicas(c, k, &r.e, sectors, journal_seq, gc); +} + 
+static struct replicas_delta_list * @@ -27245,47 +28806,21 @@ index 000000000000..6fc93b56bcb2 + update_replicas_list(trans, &r.e, sectors); +} + -+#define do_mark_fn(fn, c, pos, flags, ...) \ -+({ \ -+ int gc, ret = 0; \ -+ \ -+ percpu_rwsem_assert_held(&c->mark_lock); \ -+ \ -+ for (gc = 0; gc < 2 && !ret; gc++) \ -+ if (!gc == !(flags & BTREE_TRIGGER_GC) || \ -+ (gc && gc_visited(c, pos))) \ -+ ret = fn(c, __VA_ARGS__, gc); \ -+ ret; \ -+}) -+ -+void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, -+ size_t b, bool owned_by_allocator) -+{ -+ struct bucket *g = bucket(ca, b); -+ struct bucket_mark old, new; -+ -+ old = bucket_cmpxchg(g, new, ({ -+ new.owned_by_allocator = owned_by_allocator; -+ })); -+ -+ BUG_ON(owned_by_allocator == old.owned_by_allocator); -+} -+ -+static int bch2_mark_alloc(struct btree_trans *trans, -+ struct bkey_s_c old, struct bkey_s_c new, -+ unsigned flags) ++int bch2_mark_alloc(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned flags) +{ + bool gc = flags & BTREE_TRIGGER_GC; + u64 journal_seq = trans->journal_res.seq; + struct bch_fs *c = trans->c; -+ struct bkey_alloc_unpacked u; -+ struct bch_dev *ca; -+ struct bucket *g; -+ struct bucket_mark old_m, m; ++ struct bch_alloc_v4 old_a, new_a; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, new.k->p.inode); ++ int ret = 0; + -+ /* We don't do anything for deletions - do we?: */ -+ if (!bkey_is_alloc(new.k)) -+ return 0; ++ if (bch2_trans_inconsistent_on(new.k->p.offset < ca->mi.first_bucket || ++ new.k->p.offset >= ca->mi.nbuckets, trans, ++ "alloc key outside range of device's buckets")) ++ return -EIO; + + /* + * alloc btree is read in by bch2_alloc_read, not gc: @@ -27294,44 +28829,81 @@ index 000000000000..6fc93b56bcb2 + !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) + return 0; + -+ if (flags & BTREE_TRIGGER_INSERT) { -+ struct bch_alloc_v3 *v = (struct bch_alloc_v3 *) new.v; ++ bch2_alloc_to_v4(old, &old_a); ++ bch2_alloc_to_v4(new, 
&new_a); ++ ++ if ((flags & BTREE_TRIGGER_INSERT) && ++ !old_a.data_type != !new_a.data_type && ++ new.k->type == KEY_TYPE_alloc_v4) { ++ struct bch_alloc_v4 *v = (struct bch_alloc_v4 *) new.v; + + BUG_ON(!journal_seq); -+ BUG_ON(new.k->type != KEY_TYPE_alloc_v3); + -+ v->journal_seq = cpu_to_le64(journal_seq); ++ /* ++ * If the btree updates referring to a bucket weren't flushed ++ * before the bucket became empty again, then the we don't have ++ * to wait on a journal flush before we can reuse the bucket: ++ */ ++ new_a.journal_seq = !new_a.data_type && ++ (journal_seq == v->journal_seq || ++ bch2_journal_noflush_seq(&c->journal, v->journal_seq)) ++ ? 0 : journal_seq; ++ v->journal_seq = new_a.journal_seq; + } + -+ ca = bch_dev_bkey_exists(c, new.k->p.inode); -+ -+ if (new.k->p.offset >= ca->mi.nbuckets) -+ return 0; -+ -+ g = __bucket(ca, new.k->p.offset, gc); -+ u = bch2_alloc_unpack(new); -+ -+ old_m = bucket_cmpxchg(g, m, ({ -+ m.gen = u.gen; -+ m.data_type = u.data_type; -+ m.dirty_sectors = u.dirty_sectors; -+ m.cached_sectors = u.cached_sectors; -+ m.stripe = u.stripe != 0; -+ -+ if (journal_seq) { -+ m.journal_seq_valid = 1; -+ m.journal_seq = journal_seq; ++ if (old_a.data_type && !new_a.data_type && new_a.journal_seq) { ++ ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, ++ c->journal.flushed_seq_ondisk, ++ new.k->p.inode, new.k->p.offset, ++ new_a.journal_seq); ++ if (ret) { ++ bch2_fs_fatal_error(c, ++ "error setting bucket_needs_journal_commit: %i", ret); ++ return ret; + } -+ })); ++ } + -+ bch2_dev_usage_update(c, ca, old_m, m, journal_seq, gc); ++ if (!new_a.data_type && ++ (!new_a.journal_seq || new_a.journal_seq < c->journal.flushed_seq_ondisk)) ++ closure_wake_up(&c->freelist_wait); + -+ g->io_time[READ] = u.read_time; -+ g->io_time[WRITE] = u.write_time; -+ g->oldest_gen = u.oldest_gen; -+ g->gen_valid = 1; -+ g->stripe = u.stripe; -+ g->stripe_redundancy = u.stripe_redundancy; ++ if ((flags & BTREE_TRIGGER_INSERT) 
&& ++ BCH_ALLOC_V4_NEED_DISCARD(&new_a) && ++ !new_a.journal_seq) ++ bch2_do_discards(c); ++ ++ if (!old_a.data_type && ++ new_a.data_type && ++ should_invalidate_buckets(ca)) ++ bch2_do_invalidates(c); ++ ++ if (bucket_state(new_a) == BUCKET_need_gc_gens) { ++ atomic_inc(&c->kick_gc); ++ wake_up_process(c->gc_thread); ++ } ++ ++ percpu_down_read(&c->mark_lock); ++ if (!gc && new_a.gen != old_a.gen) ++ *bucket_gen(ca, new.k->p.offset) = new_a.gen; ++ ++ bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, gc); ++ ++ if (gc) { ++ struct bucket *g = gc_bucket(ca, new.k->p.offset); ++ ++ bucket_lock(g); ++ ++ g->gen_valid = 1; ++ g->gen = new_a.gen; ++ g->data_type = new_a.data_type; ++ g->stripe = new_a.stripe; ++ g->stripe_redundancy = new_a.stripe_redundancy; ++ g->dirty_sectors = new_a.dirty_sectors; ++ g->cached_sectors = new_a.cached_sectors; ++ ++ bucket_unlock(g); ++ } ++ percpu_up_read(&c->mark_lock); + + /* + * need to know if we're getting called from the invalidate path or @@ -27339,45 +28911,52 @@ index 000000000000..6fc93b56bcb2 + */ + + if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && -+ old_m.cached_sectors) { -+ if (update_cached_sectors(c, ca->dev_idx, -old_m.cached_sectors, -+ journal_seq, gc)) { ++ old_a.cached_sectors) { ++ ret = update_cached_sectors(c, new, ca->dev_idx, ++ -old_a.cached_sectors, ++ journal_seq, gc); ++ if (ret) { + bch2_fs_fatal_error(c, "bch2_mark_alloc(): no replicas entry while updating cached sectors"); -+ return -1; ++ return ret; + } + + trace_invalidate(ca, bucket_to_sector(ca, new.k->p.offset), -+ old_m.cached_sectors); ++ old_a.cached_sectors); + } + + return 0; +} + -+#define checked_add(a, b) \ -+({ \ -+ unsigned _res = (unsigned) (a) + (b); \ -+ bool overflow = _res > U16_MAX; \ -+ if (overflow) \ -+ _res = U16_MAX; \ -+ (a) = _res; \ -+ overflow; \ -+}) -+ -+static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, -+ size_t b, enum bch_data_type data_type, -+ unsigned sectors, bool gc) ++void 
bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, enum bch_data_type data_type, ++ unsigned sectors, struct gc_pos pos, ++ unsigned flags) +{ -+ struct bucket *g = __bucket(ca, b, gc); -+ struct bucket_mark old, new; ++ struct bucket old, new, *g; + bool overflow; + ++ BUG_ON(!(flags & BTREE_TRIGGER_GC)); + BUG_ON(data_type != BCH_DATA_sb && + data_type != BCH_DATA_journal); + -+ old = bucket_cmpxchg(g, new, ({ -+ new.data_type = data_type; -+ overflow = checked_add(new.dirty_sectors, sectors); -+ })); ++ /* ++ * Backup superblock might be past the end of our normal usable space: ++ */ ++ if (b >= ca->mi.nbuckets) ++ return; ++ ++ percpu_down_read(&c->mark_lock); ++ g = gc_bucket(ca, b); ++ ++ bucket_lock(g); ++ old = *g; ++ ++ g->data_type = data_type; ++ g->dirty_sectors += sectors; ++ overflow = g->dirty_sectors < sectors; ++ ++ new = *g; ++ bucket_unlock(g); + + bch2_fs_inconsistent_on(old.data_type && + old.data_type != data_type, c, @@ -27391,32 +28970,8 @@ index 000000000000..6fc93b56bcb2 + bch2_data_types[old.data_type ?: data_type], + old.dirty_sectors, sectors); + -+ if (c) -+ bch2_dev_usage_update(c, ca, old, new, 0, gc); -+ -+ return 0; -+} -+ -+void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, -+ size_t b, enum bch_data_type type, -+ unsigned sectors, struct gc_pos pos, -+ unsigned flags) -+{ -+ BUG_ON(type != BCH_DATA_sb && -+ type != BCH_DATA_journal); -+ -+ /* -+ * Backup superblock might be past the end of our normal usable space: -+ */ -+ if (b >= ca->mi.nbuckets) -+ return; -+ -+ if (likely(c)) { -+ do_mark_fn(__bch2_mark_metadata_bucket, c, pos, flags, -+ ca, b, type, sectors); -+ } else { -+ __bch2_mark_metadata_bucket(c, ca, b, type, sectors, 0); -+ } ++ bch2_dev_usage_update_m(c, ca, old, new, 0, true); ++ percpu_up_read(&c->mark_lock); +} + +static s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p) @@ -27434,124 +28989,154 @@ index 000000000000..6fc93b56bcb2 + struct bkey_s_c k, + 
const struct bch_extent_ptr *ptr, + s64 sectors, enum bch_data_type ptr_data_type, -+ u8 bucket_gen, u8 bucket_data_type, -+ u16 dirty_sectors, u16 cached_sectors) ++ u8 b_gen, u8 bucket_data_type, ++ u32 dirty_sectors, u32 cached_sectors) +{ -+ size_t bucket_nr = PTR_BUCKET_NR(bch_dev_bkey_exists(c, ptr->dev), ptr); ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ size_t bucket_nr = PTR_BUCKET_NR(ca, ptr); + u16 bucket_sectors = !ptr->cached + ? dirty_sectors + : cached_sectors; -+ char buf[200]; ++ struct printbuf buf = PRINTBUF; ++ int ret = 0; + -+ if (gen_after(ptr->gen, bucket_gen)) { ++ if (gen_after(ptr->gen, b_gen)) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" + "while marking %s", -+ ptr->dev, bucket_nr, bucket_gen, ++ ptr->dev, bucket_nr, b_gen, + bch2_data_types[bucket_data_type ?: ptr_data_type], + ptr->gen, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); -+ return -EIO; ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ++ ret = -EIO; ++ goto err; + } + -+ if (gen_cmp(bucket_gen, ptr->gen) > BUCKET_GC_GEN_MAX) { ++ if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" + "while marking %s", -+ ptr->dev, bucket_nr, bucket_gen, ++ ptr->dev, bucket_nr, b_gen, + bch2_data_types[bucket_data_type ?: ptr_data_type], + ptr->gen, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); -+ return -EIO; ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ++ ret = -EIO; ++ goto err; + } + -+ if (bucket_gen != ptr->gen && !ptr->cached) { ++ if (b_gen != ptr->gen && !ptr->cached) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, -+ "bucket %u:%zu gen %u data type %s: stale dirty ptr (gen %u)\n" ++ "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n" + "while marking %s", -+ ptr->dev, bucket_nr, bucket_gen, ++ 
ptr->dev, bucket_nr, b_gen, ++ *bucket_gen(ca, bucket_nr), + bch2_data_types[bucket_data_type ?: ptr_data_type], + ptr->gen, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); -+ return -EIO; ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ++ ret = -EIO; ++ goto err; + } + -+ if (bucket_gen != ptr->gen) -+ return 1; ++ if (b_gen != ptr->gen) { ++ ret = 1; ++ goto err; ++ } + + if (bucket_data_type && ptr_data_type && + bucket_data_type != ptr_data_type) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" + "while marking %s", -+ ptr->dev, bucket_nr, bucket_gen, ++ ptr->dev, bucket_nr, b_gen, + bch2_data_types[bucket_data_type], + bch2_data_types[ptr_data_type], -+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); -+ return -EIO; ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ++ ret = -EIO; ++ goto err; + } + -+ if ((unsigned) (bucket_sectors + sectors) > U16_MAX) { ++ if ((unsigned) (bucket_sectors + sectors) > U32_MAX) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX\n" + "while marking %s", -+ ptr->dev, bucket_nr, bucket_gen, ++ ptr->dev, bucket_nr, b_gen, + bch2_data_types[bucket_data_type ?: ptr_data_type], + bucket_sectors, sectors, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); -+ return -EIO; ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ++ ret = -EIO; ++ goto err; + } -+ -+ return 0; ++err: ++ printbuf_exit(&buf); ++ return ret; +} + +static int mark_stripe_bucket(struct btree_trans *trans, + struct bkey_s_c k, + unsigned ptr_idx, -+ u64 journal_seq, unsigned flags) ++ unsigned flags) +{ + struct bch_fs *c = trans->c; ++ u64 journal_seq = trans->journal_res.seq; + const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; + unsigned nr_data = s->nr_blocks - s->nr_redundant; + bool parity = ptr_idx >= nr_data; ++ enum 
bch_data_type data_type = parity ? BCH_DATA_parity : 0; ++ s64 sectors = parity ? le16_to_cpu(s->sectors) : 0; + const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx; -+ bool gc = flags & BTREE_TRIGGER_GC; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ struct bucket *g = PTR_BUCKET(ca, ptr, gc); -+ struct bucket_mark new, old; -+ char buf[200]; -+ int ret; ++ struct bucket old, new, *g; ++ struct printbuf buf = PRINTBUF; ++ int ret = 0; + -+ if (g->stripe && g->stripe != k.k->p.offset) { ++ BUG_ON(!(flags & BTREE_TRIGGER_GC)); ++ ++ /* * XXX doesn't handle deletion */ ++ ++ percpu_down_read(&c->mark_lock); ++ buf.atomic++; ++ g = PTR_GC_BUCKET(ca, ptr); ++ ++ if (g->dirty_sectors || ++ (g->stripe && g->stripe != k.k->p.offset)) { + bch2_fs_inconsistent(c, + "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", -+ ptr->dev, PTR_BUCKET_NR(ca, ptr), g->mark.gen, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); -+ return -EINVAL; ++ ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen, ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ++ ret = -EINVAL; ++ goto err; + } + -+ old = bucket_cmpxchg(g, new, ({ -+ ret = check_bucket_ref(c, k, ptr, 0, 0, new.gen, new.data_type, -+ new.dirty_sectors, new.cached_sectors); -+ if (ret) -+ return ret; ++ bucket_lock(g); ++ old = *g; + -+ if (parity) { -+ new.data_type = BCH_DATA_parity; -+ new.dirty_sectors = le16_to_cpu(s->sectors); -+ } ++ ret = check_bucket_ref(c, k, ptr, sectors, data_type, ++ new.gen, new.data_type, ++ new.dirty_sectors, new.cached_sectors); ++ if (ret) { ++ bucket_unlock(g); ++ goto err; ++ } + -+ if (journal_seq) { -+ new.journal_seq_valid = 1; -+ new.journal_seq = journal_seq; -+ } -+ })); ++ new.dirty_sectors += sectors; ++ if (data_type) ++ new.data_type = data_type; + + g->stripe = k.k->p.offset; + g->stripe_redundancy = s->nr_redundant; + -+ bch2_dev_usage_update(c, ca, old, new, journal_seq, gc); -+ return 0; ++ new = *g; ++ bucket_unlock(g); ++ ++ bch2_dev_usage_update_m(c, ca, old, 
new, journal_seq, true); ++err: ++ percpu_up_read(&c->mark_lock); ++ printbuf_exit(&buf); ++ return ret; +} + +static int __mark_pointer(struct btree_trans *trans, @@ -27559,9 +29144,9 @@ index 000000000000..6fc93b56bcb2 + const struct bch_extent_ptr *ptr, + s64 sectors, enum bch_data_type ptr_data_type, + u8 bucket_gen, u8 *bucket_data_type, -+ u16 *dirty_sectors, u16 *cached_sectors) ++ u32 *dirty_sectors, u32 *cached_sectors) +{ -+ u16 *dst_sectors = !ptr->cached ++ u32 *dst_sectors = !ptr->cached + ? dirty_sectors + : cached_sectors; + int ret = check_bucket_ref(trans->c, k, ptr, sectors, ptr_data_type, @@ -27583,64 +29168,64 @@ index 000000000000..6fc93b56bcb2 + s64 sectors, enum bch_data_type data_type, + unsigned flags) +{ -+ bool gc = flags & BTREE_TRIGGER_GC; + u64 journal_seq = trans->journal_res.seq; + struct bch_fs *c = trans->c; -+ struct bucket_mark old, new; + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); -+ struct bucket *g = PTR_BUCKET(ca, &p.ptr, gc); ++ struct bucket old, new, *g; + u8 bucket_data_type; -+ u64 v; -+ int ret; ++ int ret = 0; + -+ v = atomic64_read(&g->_mark.v); -+ do { -+ new.v.counter = old.v.counter = v; -+ bucket_data_type = new.data_type; ++ BUG_ON(!(flags & BTREE_TRIGGER_GC)); + -+ ret = __mark_pointer(trans, k, &p.ptr, sectors, -+ data_type, new.gen, -+ &bucket_data_type, -+ &new.dirty_sectors, -+ &new.cached_sectors); -+ if (ret) -+ return ret; ++ percpu_down_read(&c->mark_lock); ++ g = PTR_GC_BUCKET(ca, &p.ptr); + -+ new.data_type = bucket_data_type; ++ bucket_lock(g); ++ old = *g; + -+ if (journal_seq) { -+ new.journal_seq_valid = 1; -+ new.journal_seq = journal_seq; -+ } ++ bucket_data_type = g->data_type; + -+ if (flags & BTREE_TRIGGER_NOATOMIC) { -+ g->_mark = new; -+ break; -+ } -+ } while ((v = atomic64_cmpxchg(&g->_mark.v, -+ old.v.counter, -+ new.v.counter)) != old.v.counter); ++ ret = __mark_pointer(trans, k, &p.ptr, sectors, ++ data_type, g->gen, ++ &bucket_data_type, ++ &g->dirty_sectors, ++ 
&g->cached_sectors); ++ if (ret) { ++ bucket_unlock(g); ++ goto err; ++ } + -+ bch2_dev_usage_update(c, ca, old, new, journal_seq, gc); ++ g->data_type = bucket_data_type; + -+ BUG_ON(!gc && bucket_became_unavailable(old, new)); ++ new = *g; ++ bucket_unlock(g); + -+ return 0; ++ bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true); ++err: ++ percpu_up_read(&c->mark_lock); ++ ++ return ret; +} + +static int bch2_mark_stripe_ptr(struct btree_trans *trans, ++ struct bkey_s_c k, + struct bch_extent_stripe_ptr p, + enum bch_data_type data_type, + s64 sectors, + unsigned flags) +{ -+ bool gc = flags & BTREE_TRIGGER_GC; + struct bch_fs *c = trans->c; + struct bch_replicas_padded r; -+ struct stripe *m; -+ unsigned i, blocks_nonempty = 0; ++ struct gc_stripe *m; + -+ m = genradix_ptr(&c->stripes[gc], p.idx); ++ BUG_ON(!(flags & BTREE_TRIGGER_GC)); ++ ++ m = genradix_ptr_alloc(&c->gc_stripes, p.idx, GFP_KERNEL); ++ if (!m) { ++ bch_err(c, "error allocating memory for gc_stripes, idx %llu", ++ (u64) p.idx); ++ return -ENOMEM; ++ } + + spin_lock(&c->ec_stripes_heap_lock); + @@ -27655,29 +29240,18 @@ index 000000000000..6fc93b56bcb2 + m->block_sectors[p.block] += sectors; + + r = m->r; -+ -+ for (i = 0; i < m->nr_blocks; i++) -+ blocks_nonempty += m->block_sectors[i] != 0; -+ -+ if (m->blocks_nonempty != blocks_nonempty) { -+ m->blocks_nonempty = blocks_nonempty; -+ if (!gc) -+ bch2_stripes_heap_update(c, m, p.idx); -+ } -+ + spin_unlock(&c->ec_stripes_heap_lock); + + r.e.data_type = data_type; -+ update_replicas(c, &r.e, sectors, trans->journal_res.seq, gc); ++ update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, true); + + return 0; +} + -+static int bch2_mark_extent(struct btree_trans *trans, -+ struct bkey_s_c old, struct bkey_s_c new, -+ unsigned flags) ++int bch2_mark_extent(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned flags) +{ -+ bool gc = flags & BTREE_TRIGGER_GC; + u64 journal_seq = trans->journal_res.seq; + 
struct bch_fs *c = trans->c; + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; @@ -27689,12 +29263,14 @@ index 000000000000..6fc93b56bcb2 + ? BCH_DATA_btree + : BCH_DATA_user; + s64 sectors = bkey_is_btree_ptr(k.k) -+ ? c->opts.btree_node_size ++ ? btree_sectors(c) + : k.k->size; + s64 dirty_sectors = 0; + bool stale; + int ret; + ++ BUG_ON(!(flags & BTREE_TRIGGER_GC)); ++ + r.e.data_type = data_type; + r.e.nr_devs = 0; + r.e.nr_required = 1; @@ -27713,18 +29289,19 @@ index 000000000000..6fc93b56bcb2 + stale = ret > 0; + + if (p.ptr.cached) { -+ if (!stale) -+ if (update_cached_sectors(c, p.ptr.dev, disk_sectors, -+ journal_seq, gc)) { ++ if (!stale) { ++ ret = update_cached_sectors(c, k, p.ptr.dev, ++ disk_sectors, journal_seq, true); ++ if (ret) { + bch2_fs_fatal_error(c, "bch2_mark_extent(): no replicas entry while updating cached sectors"); -+ return -1; -+ ++ return ret; + } ++ } + } else if (!p.has_ec) { + dirty_sectors += disk_sectors; + r.e.devs[r.e.nr_devs++] = p.ptr.dev; + } else { -+ ret = bch2_mark_stripe_ptr(trans, p.ec, data_type, ++ ret = bch2_mark_stripe_ptr(trans, k, p.ec, data_type, + disk_sectors, flags); + if (ret) + return ret; @@ -27739,110 +29316,130 @@ index 000000000000..6fc93b56bcb2 + } + + if (r.e.nr_devs) { -+ if (update_replicas(c, &r.e, dirty_sectors, journal_seq, gc)) { -+ char buf[200]; ++ ret = update_replicas(c, k, &r.e, dirty_sectors, journal_seq, true); ++ if (ret) { ++ struct printbuf buf = PRINTBUF; + -+ bch2_bkey_val_to_text(&PBUF(buf), c, k); -+ bch2_fs_fatal_error(c, "no replicas entry for %s", buf); -+ return -1; ++ bch2_bkey_val_to_text(&buf, c, k); ++ bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf); ++ printbuf_exit(&buf); ++ return ret; + } + } + + return 0; +} + -+static int bch2_mark_stripe(struct btree_trans *trans, -+ struct bkey_s_c old, struct bkey_s_c new, -+ unsigned flags) ++int bch2_mark_stripe(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned flags) 
+{ + bool gc = flags & BTREE_TRIGGER_GC; + u64 journal_seq = trans->journal_res.seq; + struct bch_fs *c = trans->c; -+ size_t idx = new.k->p.offset; ++ u64 idx = new.k->p.offset; + const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe + ? bkey_s_c_to_stripe(old).v : NULL; + const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe + ? bkey_s_c_to_stripe(new).v : NULL; -+ struct stripe *m = genradix_ptr(&c->stripes[gc], idx); + unsigned i; + int ret; + + BUG_ON(gc && old_s); + -+ if (!m || (old_s && !m->alive)) { -+ char buf1[200], buf2[200]; ++ if (!gc) { ++ struct stripe *m = genradix_ptr(&c->stripes, idx); + -+ bch2_bkey_val_to_text(&PBUF(buf1), c, old); -+ bch2_bkey_val_to_text(&PBUF(buf2), c, new); -+ bch_err_ratelimited(c, "error marking nonexistent stripe %zu while marking\n" -+ "old %s\n" -+ "new %s", idx, buf1, buf2); -+ bch2_inconsistent_error(c); -+ return -1; -+ } ++ if (!m || (old_s && !m->alive)) { ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; + -+ if (!new_s) { -+ spin_lock(&c->ec_stripes_heap_lock); -+ bch2_stripes_heap_del(c, m, idx); -+ spin_unlock(&c->ec_stripes_heap_lock); -+ -+ memset(m, 0, sizeof(*m)); -+ } else { -+ m->alive = true; -+ m->sectors = le16_to_cpu(new_s->sectors); -+ m->algorithm = new_s->algorithm; -+ m->nr_blocks = new_s->nr_blocks; -+ m->nr_redundant = new_s->nr_redundant; -+ m->blocks_nonempty = 0; -+ -+ for (i = 0; i < new_s->nr_blocks; i++) { -+ m->block_sectors[i] = -+ stripe_blockcount_get(new_s, i); -+ m->blocks_nonempty += !!m->block_sectors[i]; -+ -+ m->ptrs[i] = new_s->ptrs[i]; ++ bch2_bkey_val_to_text(&buf1, c, old); ++ bch2_bkey_val_to_text(&buf2, c, new); ++ bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n" ++ "old %s\n" ++ "new %s", idx, buf1.buf, buf2.buf); ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf1); ++ bch2_inconsistent_error(c); ++ return -1; + } + -+ bch2_bkey_to_replicas(&m->r.e, new); ++ if (!new_s) { ++ 
spin_lock(&c->ec_stripes_heap_lock); ++ bch2_stripes_heap_del(c, m, idx); ++ spin_unlock(&c->ec_stripes_heap_lock); ++ ++ memset(m, 0, sizeof(*m)); ++ } else { ++ m->alive = true; ++ m->sectors = le16_to_cpu(new_s->sectors); ++ m->algorithm = new_s->algorithm; ++ m->nr_blocks = new_s->nr_blocks; ++ m->nr_redundant = new_s->nr_redundant; ++ m->blocks_nonempty = 0; ++ ++ for (i = 0; i < new_s->nr_blocks; i++) ++ m->blocks_nonempty += !!stripe_blockcount_get(new_s, i); + -+ if (!gc) { + spin_lock(&c->ec_stripes_heap_lock); + bch2_stripes_heap_update(c, m, idx); + spin_unlock(&c->ec_stripes_heap_lock); + } -+ } ++ } else { ++ struct gc_stripe *m = ++ genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL); ++ ++ if (!m) { ++ bch_err(c, "error allocating memory for gc_stripes, idx %llu", ++ idx); ++ return -ENOMEM; ++ } ++ /* ++ * This will be wrong when we bring back runtime gc: we should ++ * be unmarking the old key and then marking the new key ++ */ ++ m->alive = true; ++ m->sectors = le16_to_cpu(new_s->sectors); ++ m->nr_blocks = new_s->nr_blocks; ++ m->nr_redundant = new_s->nr_redundant; ++ ++ for (i = 0; i < new_s->nr_blocks; i++) ++ m->ptrs[i] = new_s->ptrs[i]; ++ ++ bch2_bkey_to_replicas(&m->r.e, new); + -+ if (gc) { + /* + * gc recalculates this field from stripe ptr + * references: + */ + memset(m->block_sectors, 0, sizeof(m->block_sectors)); -+ m->blocks_nonempty = 0; + + for (i = 0; i < new_s->nr_blocks; i++) { -+ ret = mark_stripe_bucket(trans, new, i, journal_seq, flags); ++ ret = mark_stripe_bucket(trans, new, i, flags); + if (ret) + return ret; + } + -+ if (update_replicas(c, &m->r.e, -+ ((s64) m->sectors * m->nr_redundant), -+ journal_seq, gc)) { -+ char buf[200]; ++ ret = update_replicas(c, new, &m->r.e, ++ ((s64) m->sectors * m->nr_redundant), ++ journal_seq, gc); ++ if (ret) { ++ struct printbuf buf = PRINTBUF; + -+ bch2_bkey_val_to_text(&PBUF(buf), c, new); -+ bch2_fs_fatal_error(c, "no replicas entry for %s", buf); -+ return -1; ++ 
bch2_bkey_val_to_text(&buf, c, new); ++ bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf); ++ printbuf_exit(&buf); ++ return ret; + } + } + + return 0; +} + -+static int bch2_mark_inode(struct btree_trans *trans, -+ struct bkey_s_c old, struct bkey_s_c new, -+ unsigned flags) ++int bch2_mark_inode(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bch_fs_usage __percpu *fs_usage; @@ -27858,18 +29455,22 @@ index 000000000000..6fc93b56bcb2 + } + + if (flags & BTREE_TRIGGER_GC) { ++ percpu_down_read(&c->mark_lock); + preempt_disable(); ++ + fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC); + fs_usage->nr_inodes += bkey_is_inode(new.k); + fs_usage->nr_inodes -= bkey_is_inode(old.k); ++ + preempt_enable(); ++ percpu_up_read(&c->mark_lock); + } + return 0; +} + -+static int bch2_mark_reservation(struct btree_trans *trans, -+ struct bkey_s_c old, struct bkey_s_c new, -+ unsigned flags) ++int bch2_mark_reservation(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? 
old: new; @@ -27877,34 +29478,46 @@ index 000000000000..6fc93b56bcb2 + unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; + s64 sectors = (s64) k.k->size; + ++ BUG_ON(!(flags & BTREE_TRIGGER_GC)); ++ + if (flags & BTREE_TRIGGER_OVERWRITE) + sectors = -sectors; + sectors *= replicas; + ++ percpu_down_read(&c->mark_lock); + preempt_disable(); ++ + fs_usage = fs_usage_ptr(c, trans->journal_res.seq, flags & BTREE_TRIGGER_GC); + replicas = clamp_t(unsigned, replicas, 1, + ARRAY_SIZE(fs_usage->persistent_reserved)); + + fs_usage->reserved += sectors; + fs_usage->persistent_reserved[replicas - 1] += sectors; ++ + preempt_enable(); ++ percpu_up_read(&c->mark_lock); + + return 0; +} + -+static s64 __bch2_mark_reflink_p(struct bch_fs *c, struct bkey_s_c_reflink_p p, ++static s64 __bch2_mark_reflink_p(struct btree_trans *trans, ++ struct bkey_s_c_reflink_p p, ++ u64 start, u64 end, + u64 *idx, unsigned flags, size_t r_idx) +{ ++ struct bch_fs *c = trans->c; + struct reflink_gc *r; + int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 
1 : -1; ++ u64 next_idx = end; + s64 ret = 0; ++ struct printbuf buf = PRINTBUF; + + if (r_idx >= c->reflink_gc_nr) + goto not_found; + + r = genradix_ptr(&c->reflink_gc_table, r_idx); -+ if (*idx < r->offset - r->size) ++ next_idx = min(next_idx, r->offset - r->size); ++ if (*idx < next_idx) + goto not_found; + + BUG_ON((s64) r->refcount + add < 0); @@ -27913,46 +29526,42 @@ index 000000000000..6fc93b56bcb2 + *idx = r->offset; + return 0; +not_found: -+ *idx = U64_MAX; -+ ret = -EIO; ++ if (fsck_err(c, "pointer to missing indirect extent\n" ++ " %s\n" ++ " missing range %llu-%llu", ++ (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), ++ *idx, next_idx)) { ++ struct bkey_i_error new; + -+ /* -+ * XXX: we're replacing the entire reflink pointer with an error -+ * key, we should just be replacing the part that was missing: -+ */ -+ if (fsck_err(c, "%llu:%llu len %u points to nonexistent indirect extent %llu", -+ p.k->p.inode, p.k->p.offset, p.k->size, *idx)) { -+ struct bkey_i_error *new; -+ -+ new = kmalloc(sizeof(*new), GFP_KERNEL); -+ if (!new) { -+ bch_err(c, "%s: error allocating new key", __func__); -+ return -ENOMEM; -+ } -+ -+ bkey_init(&new->k); -+ new->k.type = KEY_TYPE_error; -+ new->k.p = p.k->p; -+ new->k.size = p.k->size; -+ ret = bch2_journal_key_insert(c, BTREE_ID_extents, 0, &new->k_i); ++ bkey_init(&new.k); ++ new.k.type = KEY_TYPE_error; ++ new.k.p = bkey_start_pos(p.k); ++ new.k.p.offset += *idx - start; ++ bch2_key_resize(&new.k, next_idx - *idx); ++ ret = __bch2_btree_insert(trans, BTREE_ID_extents, &new.k_i); + } ++ ++ *idx = next_idx; +fsck_err: ++ printbuf_exit(&buf); + return ret; +} + -+static int bch2_mark_reflink_p(struct btree_trans *trans, -+ struct bkey_s_c old, struct bkey_s_c new, -+ unsigned flags) ++int bch2_mark_reflink_p(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? 
old: new; + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + struct reflink_gc *ref; + size_t l, r, m; -+ u64 idx = le64_to_cpu(p.v->idx); ++ u64 idx = le64_to_cpu(p.v->idx), start = idx; + u64 end = le64_to_cpu(p.v->idx) + p.k->size; + int ret = 0; + ++ BUG_ON(!(flags & BTREE_TRIGGER_GC)); ++ + if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) { + idx -= le32_to_cpu(p.v->front_pad); + end += le32_to_cpu(p.v->back_pad); @@ -27971,89 +29580,8 @@ index 000000000000..6fc93b56bcb2 + } + + while (idx < end && !ret) -+ ret = __bch2_mark_reflink_p(c, p, &idx, flags, l++); -+ -+ return ret; -+} -+ -+static int bch2_mark_key_locked(struct btree_trans *trans, -+ struct bkey_s_c old, -+ struct bkey_s_c new, -+ unsigned flags) -+{ -+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; -+ -+ switch (k.k->type) { -+ case KEY_TYPE_alloc: -+ case KEY_TYPE_alloc_v2: -+ case KEY_TYPE_alloc_v3: -+ return bch2_mark_alloc(trans, old, new, flags); -+ case KEY_TYPE_btree_ptr: -+ case KEY_TYPE_btree_ptr_v2: -+ case KEY_TYPE_extent: -+ case KEY_TYPE_reflink_v: -+ return bch2_mark_extent(trans, old, new, flags); -+ case KEY_TYPE_stripe: -+ return bch2_mark_stripe(trans, old, new, flags); -+ case KEY_TYPE_inode: -+ case KEY_TYPE_inode_v2: -+ return bch2_mark_inode(trans, old, new, flags); -+ case KEY_TYPE_reservation: -+ return bch2_mark_reservation(trans, old, new, flags); -+ case KEY_TYPE_reflink_p: -+ return bch2_mark_reflink_p(trans, old, new, flags); -+ case KEY_TYPE_snapshot: -+ return bch2_mark_snapshot(trans, old, new, flags); -+ default: -+ return 0; -+ } -+} -+ -+int bch2_mark_key(struct btree_trans *trans, struct bkey_s_c new, unsigned flags) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey deleted = KEY(0, 0, 0); -+ struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; -+ int ret; -+ -+ deleted.p = new.k->p; -+ -+ percpu_down_read(&c->mark_lock); -+ ret = bch2_mark_key_locked(trans, old, new, flags); -+ percpu_up_read(&c->mark_lock); -+ -+ 
return ret; -+} -+ -+int bch2_mark_update(struct btree_trans *trans, struct btree_path *path, -+ struct bkey_i *new, unsigned flags) -+{ -+ struct bkey _deleted = KEY(0, 0, 0); -+ struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; -+ struct bkey_s_c old; -+ struct bkey unpacked; -+ int ret; -+ -+ _deleted.p = path->pos; -+ -+ if (unlikely(flags & BTREE_TRIGGER_NORUN)) -+ return 0; -+ -+ if (!btree_node_type_needs_gc(path->btree_id)) -+ return 0; -+ -+ old = bch2_btree_path_peek_slot(path, &unpacked); -+ -+ if (old.k->type == new->k.type && -+ ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { -+ ret = bch2_mark_key_locked(trans, old, bkey_i_to_s_c(new), -+ BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); -+ } else { -+ ret = bch2_mark_key_locked(trans, deleted, bkey_i_to_s_c(new), -+ BTREE_TRIGGER_INSERT|flags) ?: -+ bch2_mark_key_locked(trans, old, deleted, -+ BTREE_TRIGGER_OVERWRITE|flags); -+ } ++ ret = __bch2_mark_reflink_p(trans, p, start, end, ++ &idx, flags, l++); + + return ret; +} @@ -28065,50 +29593,42 @@ index 000000000000..6fc93b56bcb2 +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; -+ char buf[200]; ++ struct printbuf buf = PRINTBUF; + + bch_err(c, "disk usage increased %lli more than %u sectors reserved", + should_not_have_added, disk_res_sectors); + + trans_for_each_update(trans, i) { ++ struct bkey_s_c old = { &i->old_k, i->old_v }; ++ + pr_err("while inserting"); -+ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); -+ pr_err("%s", buf); ++ printbuf_reset(&buf); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); ++ pr_err(" %s", buf.buf); + pr_err("overlapping with"); -+ -+ if (!i->cached) { -+ struct bkey u; -+ struct bkey_s_c k = bch2_btree_path_peek_slot(i->path, &u); -+ -+ bch2_bkey_val_to_text(&PBUF(buf), c, k); -+ pr_err("%s", buf); -+ } else { -+ struct bkey_cached *ck = (void *) i->path->l[0].b; -+ -+ if (ck->valid) { -+ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(ck->k)); -+ 
pr_err("%s", buf); -+ } -+ } ++ printbuf_reset(&buf); ++ bch2_bkey_val_to_text(&buf, c, old); ++ pr_err(" %s", buf.buf); + } ++ + __WARN(); ++ printbuf_exit(&buf); +} + -+void bch2_trans_fs_usage_apply(struct btree_trans *trans, -+ struct replicas_delta_list *deltas) ++int bch2_trans_fs_usage_apply(struct btree_trans *trans, ++ struct replicas_delta_list *deltas) +{ + struct bch_fs *c = trans->c; + static int warned_disk_usage = 0; + bool warn = false; + unsigned disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0; -+ struct replicas_delta *d = deltas->d; ++ struct replicas_delta *d = deltas->d, *d2; + struct replicas_delta *top = (void *) deltas->d + deltas->used; + struct bch_fs_usage *dst; + s64 added = 0, should_not_have_added; + unsigned i; + -+ percpu_rwsem_assert_held(&c->mark_lock); -+ ++ percpu_down_read(&c->mark_lock); + preempt_disable(); + dst = fs_usage_ptr(c, trans->journal_res.seq, false); + @@ -28120,7 +29640,8 @@ index 000000000000..6fc93b56bcb2 + added += d->delta; + } + -+ BUG_ON(__update_replicas(c, dst, &d->r, d->delta)); ++ if (__update_replicas(c, dst, &d->r, d->delta)) ++ goto need_mark; + } + + dst->nr_inodes += deltas->nr_inodes; @@ -28155,74 +29676,44 @@ index 000000000000..6fc93b56bcb2 + } + + preempt_enable(); ++ percpu_up_read(&c->mark_lock); + + if (unlikely(warn) && !xchg(&warned_disk_usage, 1)) + fs_usage_apply_warn(trans, disk_res_sectors, should_not_have_added); ++ return 0; ++need_mark: ++ /* revert changes: */ ++ for (d2 = deltas->d; d2 != d; d2 = replicas_delta_next(d2)) ++ BUG_ON(__update_replicas(c, dst, &d2->r, -d2->delta)); ++ ++ preempt_enable(); ++ percpu_up_read(&c->mark_lock); ++ return -1; +} + +/* trans_mark: */ + -+static struct bkey_alloc_buf * -+bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter, -+ const struct bch_extent_ptr *ptr, -+ struct bkey_alloc_unpacked *u) -+{ -+ struct bch_fs *c = trans->c; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ struct 
bpos pos = POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)); -+ struct bucket *g; -+ struct bkey_alloc_buf *a; -+ struct bkey_i *update = btree_trans_peek_updates(trans, BTREE_ID_alloc, pos); -+ int ret; -+ -+ a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf)); -+ if (IS_ERR(a)) -+ return a; -+ -+ bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos, -+ BTREE_ITER_CACHED| -+ BTREE_ITER_CACHED_NOFILL| -+ BTREE_ITER_INTENT); -+ ret = bch2_btree_iter_traverse(iter); -+ if (ret) { -+ bch2_trans_iter_exit(trans, iter); -+ return ERR_PTR(ret); -+ } -+ -+ if (update && !bpos_cmp(update->k.p, pos)) { -+ *u = bch2_alloc_unpack(bkey_i_to_s_c(update)); -+ } else { -+ percpu_down_read(&c->mark_lock); -+ g = bucket(ca, pos.offset); -+ *u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark)); -+ percpu_up_read(&c->mark_lock); -+ } -+ -+ return a; -+} -+ +static int bch2_trans_mark_pointer(struct btree_trans *trans, + struct bkey_s_c k, struct extent_ptr_decoded p, + s64 sectors, enum bch_data_type data_type) +{ -+ struct bch_fs *c = trans->c; + struct btree_iter iter; -+ struct bkey_alloc_unpacked u; -+ struct bkey_alloc_buf *a; ++ struct bkey_i_alloc_v4 *a; + int ret; + -+ a = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u); ++ a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(trans->c, &p.ptr)); + if (IS_ERR(a)) + return PTR_ERR(a); + + ret = __mark_pointer(trans, k, &p.ptr, sectors, data_type, -+ u.gen, &u.data_type, -+ &u.dirty_sectors, &u.cached_sectors); ++ a->v.gen, &a->v.data_type, ++ &a->v.dirty_sectors, &a->v.cached_sectors); + if (ret) + goto out; + -+ bch2_alloc_pack(c, a, u); -+ bch2_trans_update(trans, &iter, &a->k, 0); ++ ret = bch2_trans_update(trans, &iter, &a->k_i, 0); ++ if (ret) ++ goto out; +out: + bch2_trans_iter_exit(trans, &iter); + return ret; @@ -28232,7 +29723,6 @@ index 000000000000..6fc93b56bcb2 + struct extent_ptr_decoded p, + s64 sectors, enum bch_data_type data_type) +{ -+ struct bch_fs *c = trans->c; + struct btree_iter iter; + 
struct bkey_s_c k; + struct bkey_i_stripe *s; @@ -28248,16 +29738,15 @@ index 000000000000..6fc93b56bcb2 + goto err; + + if (k.k->type != KEY_TYPE_stripe) { -+ bch2_fs_inconsistent(c, ++ bch2_trans_inconsistent(trans, + "pointer to nonexistent stripe %llu", + (u64) p.ec.idx); -+ bch2_inconsistent_error(c); + ret = -EIO; + goto err; + } + + if (!bch2_ptr_matches_stripe(bkey_s_c_to_stripe(k).v, p)) { -+ bch2_fs_inconsistent(c, ++ bch2_trans_inconsistent(trans, + "stripe pointer doesn't match stripe %llu", + (u64) p.ec.idx); + ret = -EIO; @@ -28273,7 +29762,10 @@ index 000000000000..6fc93b56bcb2 + stripe_blockcount_set(&s->v, p.ec.block, + stripe_blockcount_get(&s->v, p.ec.block) + + sectors); -+ bch2_trans_update(trans, &iter, &s->k_i, 0); ++ ++ ret = bch2_trans_update(trans, &iter, &s->k_i, 0); ++ if (ret) ++ goto err; + + bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i)); + r.e.data_type = data_type; @@ -28283,10 +29775,14 @@ index 000000000000..6fc93b56bcb2 + return ret; +} + -+static int bch2_trans_mark_extent(struct btree_trans *trans, -+ struct bkey_s_c k, unsigned flags) ++int bch2_trans_mark_extent(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_i *new, ++ unsigned flags) +{ + struct bch_fs *c = trans->c; ++ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ++ ? old ++ : bkey_i_to_s_c(new); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; @@ -28295,7 +29791,7 @@ index 000000000000..6fc93b56bcb2 + ? BCH_DATA_btree + : BCH_DATA_user; + s64 sectors = bkey_is_btree_ptr(k.k) -+ ? c->opts.btree_node_size ++ ? 
btree_sectors(c) + : k.k->size; + s64 dirty_sectors = 0; + bool stale; @@ -28341,119 +29837,158 @@ index 000000000000..6fc93b56bcb2 + return 0; +} + -+static int bch2_trans_mark_stripe_alloc_ref(struct btree_trans *trans, -+ struct bkey_s_c_stripe s, -+ unsigned idx, bool deleting) ++static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, ++ struct bkey_s_c_stripe s, ++ unsigned idx, bool deleting) +{ + struct bch_fs *c = trans->c; + const struct bch_extent_ptr *ptr = &s.v->ptrs[idx]; -+ struct bkey_alloc_buf *a; + struct btree_iter iter; -+ struct bkey_alloc_unpacked u; -+ bool parity = idx >= s.v->nr_blocks - s.v->nr_redundant; ++ struct bkey_i_alloc_v4 *a; ++ enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant ++ ? BCH_DATA_parity : 0; ++ s64 sectors = data_type ? le16_to_cpu(s.v->sectors) : 0; + int ret = 0; + -+ a = bch2_trans_start_alloc_update(trans, &iter, ptr, &u); ++ if (deleting) ++ sectors = -sectors; ++ ++ a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(c, ptr)); + if (IS_ERR(a)) + return PTR_ERR(a); + -+ if (parity) { -+ s64 sectors = le16_to_cpu(s.v->sectors); -+ -+ if (deleting) -+ sectors = -sectors; -+ -+ u.dirty_sectors += sectors; -+ u.data_type = u.dirty_sectors -+ ? 
BCH_DATA_parity -+ : 0; -+ } ++ ret = check_bucket_ref(c, s.s_c, ptr, sectors, data_type, ++ a->v.gen, a->v.data_type, ++ a->v.dirty_sectors, a->v.cached_sectors); ++ if (ret) ++ goto err; + + if (!deleting) { -+ if (bch2_fs_inconsistent_on(u.stripe && u.stripe != s.k->p.offset, c, -+ "bucket %llu:%llu gen %u: multiple stripes using same bucket (%u, %llu)", -+ iter.pos.inode, iter.pos.offset, u.gen, -+ u.stripe, s.k->p.offset)) { ++ if (bch2_trans_inconsistent_on(a->v.stripe || ++ a->v.stripe_redundancy, trans, ++ "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)", ++ iter.pos.inode, iter.pos.offset, a->v.gen, ++ bch2_data_types[a->v.data_type], ++ a->v.dirty_sectors, ++ a->v.stripe, s.k->p.offset)) { + ret = -EIO; + goto err; + } + -+ u.stripe = s.k->p.offset; -+ u.stripe_redundancy = s.v->nr_redundant; ++ if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans, ++ "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu", ++ iter.pos.inode, iter.pos.offset, a->v.gen, ++ bch2_data_types[a->v.data_type], ++ a->v.dirty_sectors, ++ s.k->p.offset)) { ++ ret = -EIO; ++ goto err; ++ } ++ ++ a->v.stripe = s.k->p.offset; ++ a->v.stripe_redundancy = s.v->nr_redundant; + } else { -+ u.stripe = 0; -+ u.stripe_redundancy = 0; ++ if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset || ++ a->v.stripe_redundancy != s.v->nr_redundant, trans, ++ "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)", ++ iter.pos.inode, iter.pos.offset, a->v.gen, ++ s.k->p.offset, a->v.stripe)) { ++ ret = -EIO; ++ goto err; ++ } ++ ++ a->v.stripe = 0; ++ a->v.stripe_redundancy = 0; + } + -+ bch2_alloc_pack(c, a, u); -+ bch2_trans_update(trans, &iter, &a->k, 0); ++ a->v.dirty_sectors += sectors; ++ if (data_type) ++ a->v.data_type = !deleting ? 
data_type : 0; ++ ++ ret = bch2_trans_update(trans, &iter, &a->k_i, 0); ++ if (ret) ++ goto err; +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + -+static int bch2_trans_mark_stripe(struct btree_trans *trans, -+ struct bkey_s_c old, struct bkey_s_c new, -+ unsigned flags) ++int bch2_trans_mark_stripe(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_i *new, ++ unsigned flags) +{ -+ struct bkey_s_c_stripe old_s = { .k = NULL }; -+ struct bkey_s_c_stripe new_s = { .k = NULL }; ++ const struct bch_stripe *old_s = NULL; ++ struct bch_stripe *new_s = NULL; + struct bch_replicas_padded r; -+ unsigned i; ++ unsigned i, nr_blocks; + int ret = 0; + + if (old.k->type == KEY_TYPE_stripe) -+ old_s = bkey_s_c_to_stripe(old); -+ if (new.k->type == KEY_TYPE_stripe) -+ new_s = bkey_s_c_to_stripe(new); ++ old_s = bkey_s_c_to_stripe(old).v; ++ if (new->k.type == KEY_TYPE_stripe) ++ new_s = &bkey_i_to_stripe(new)->v; + + /* + * If the pointers aren't changing, we don't need to do anything: + */ -+ if (new_s.k && old_s.k && -+ new_s.v->nr_blocks == old_s.v->nr_blocks && -+ new_s.v->nr_redundant == old_s.v->nr_redundant && -+ !memcmp(old_s.v->ptrs, new_s.v->ptrs, -+ new_s.v->nr_blocks * sizeof(struct bch_extent_ptr))) ++ if (new_s && old_s && ++ new_s->nr_blocks == old_s->nr_blocks && ++ new_s->nr_redundant == old_s->nr_redundant && ++ !memcmp(old_s->ptrs, new_s->ptrs, ++ new_s->nr_blocks * sizeof(struct bch_extent_ptr))) + return 0; + -+ if (new_s.k) { -+ s64 sectors = le16_to_cpu(new_s.v->sectors); ++ BUG_ON(new_s && old_s && ++ (new_s->nr_blocks != old_s->nr_blocks || ++ new_s->nr_redundant != old_s->nr_redundant)); + -+ bch2_bkey_to_replicas(&r.e, new); -+ update_replicas_list(trans, &r.e, sectors * new_s.v->nr_redundant); ++ nr_blocks = new_s ? 
new_s->nr_blocks : old_s->nr_blocks; + -+ for (i = 0; i < new_s.v->nr_blocks; i++) { -+ ret = bch2_trans_mark_stripe_alloc_ref(trans, new_s, -+ i, false); -+ if (ret) -+ return ret; -+ } ++ if (new_s) { ++ s64 sectors = le16_to_cpu(new_s->sectors); ++ ++ bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(new)); ++ update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant); + } + -+ if (old_s.k) { -+ s64 sectors = -((s64) le16_to_cpu(old_s.v->sectors)); ++ if (old_s) { ++ s64 sectors = -((s64) le16_to_cpu(old_s->sectors)); + + bch2_bkey_to_replicas(&r.e, old); -+ update_replicas_list(trans, &r.e, sectors * old_s.v->nr_redundant); ++ update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant); ++ } + -+ for (i = 0; i < old_s.v->nr_blocks; i++) { -+ ret = bch2_trans_mark_stripe_alloc_ref(trans, old_s, -+ i, true); ++ for (i = 0; i < nr_blocks; i++) { ++ if (new_s && old_s && ++ !memcmp(&new_s->ptrs[i], ++ &old_s->ptrs[i], ++ sizeof(new_s->ptrs[i]))) ++ continue; ++ ++ if (new_s) { ++ ret = bch2_trans_mark_stripe_bucket(trans, ++ bkey_i_to_s_c_stripe(new), i, false); + if (ret) -+ return ret; ++ break; ++ } ++ ++ if (old_s) { ++ ret = bch2_trans_mark_stripe_bucket(trans, ++ bkey_s_c_to_stripe(old), i, true); ++ if (ret) ++ break; + } + } + + return ret; +} + -+static int bch2_trans_mark_inode(struct btree_trans *trans, -+ struct bkey_s_c old, -+ struct bkey_s_c new, -+ unsigned flags) ++int bch2_trans_mark_inode(struct btree_trans *trans, ++ struct bkey_s_c old, ++ struct bkey_i *new, ++ unsigned flags) +{ -+ int nr = bkey_is_inode(new.k) - bkey_is_inode(old.k); ++ int nr = bkey_is_inode(&new->k) - bkey_is_inode(old.k); + + if (nr) { + struct replicas_delta_list *d = @@ -28464,9 +29999,14 @@ index 000000000000..6fc93b56bcb2 + return 0; +} + -+static int bch2_trans_mark_reservation(struct btree_trans *trans, -+ struct bkey_s_c k, unsigned flags) ++int bch2_trans_mark_reservation(struct btree_trans *trans, ++ struct bkey_s_c old, ++ struct bkey_i *new, ++ unsigned 
flags) +{ ++ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ++ ? old ++ : bkey_i_to_s_c(new); + unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; + s64 sectors = (s64) k.k->size; + struct replicas_delta_list *d; @@ -28494,7 +30034,7 @@ index 000000000000..6fc93b56bcb2 + struct bkey_i *n; + __le64 *refcount; + int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; -+ char buf[200]; ++ struct printbuf buf = PRINTBUF; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink, POS(0, *idx), @@ -28514,19 +30054,19 @@ index 000000000000..6fc93b56bcb2 + + refcount = bkey_refcount(n); + if (!refcount) { -+ bch2_bkey_val_to_text(&PBUF(buf), c, p.s_c); -+ bch2_fs_inconsistent(c, ++ bch2_bkey_val_to_text(&buf, c, p.s_c); ++ bch2_trans_inconsistent(trans, + "nonexistent indirect extent at %llu while marking\n %s", -+ *idx, buf); ++ *idx, buf.buf); + ret = -EIO; + goto err; + } + + if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) { -+ bch2_bkey_val_to_text(&PBUF(buf), c, p.s_c); -+ bch2_fs_inconsistent(c, ++ bch2_bkey_val_to_text(&buf, c, p.s_c); ++ bch2_trans_inconsistent(trans, + "indirect extent refcount underflow at %llu while marking\n %s", -+ *idx, buf); ++ *idx, buf.buf); + ret = -EIO; + goto err; + } @@ -28548,11 +30088,6 @@ index 000000000000..6fc93b56bcb2 + + le64_add_cpu(refcount, add); + -+ if (!*refcount) { -+ n->k.type = KEY_TYPE_deleted; -+ set_bkey_val_u64s(&n->k, 0); -+ } -+ + bch2_btree_iter_set_pos_to_extent_start(&iter); + ret = bch2_trans_update(trans, &iter, n, 0); + if (ret) @@ -28561,12 +30096,18 @@ index 000000000000..6fc93b56bcb2 + *idx = k.k->p.offset; +err: + bch2_trans_iter_exit(trans, &iter); ++ printbuf_exit(&buf); + return ret; +} + -+static int bch2_trans_mark_reflink_p(struct btree_trans *trans, -+ struct bkey_s_c k, unsigned flags) ++int bch2_trans_mark_reflink_p(struct btree_trans *trans, ++ struct bkey_s_c old, ++ struct bkey_i *new, ++ unsigned flags) +{ ++ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE 
++ ? old ++ : bkey_i_to_s_c(new); + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + u64 idx, end_idx; + int ret = 0; @@ -28587,31 +30128,6 @@ index 000000000000..6fc93b56bcb2 + return ret; +} + -+int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old, -+ struct bkey_s_c new, unsigned flags) -+{ -+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; -+ -+ switch (k.k->type) { -+ case KEY_TYPE_btree_ptr: -+ case KEY_TYPE_btree_ptr_v2: -+ case KEY_TYPE_extent: -+ case KEY_TYPE_reflink_v: -+ return bch2_trans_mark_extent(trans, k, flags); -+ case KEY_TYPE_stripe: -+ return bch2_trans_mark_stripe(trans, old, new, flags); -+ case KEY_TYPE_inode: -+ case KEY_TYPE_inode_v2: -+ return bch2_trans_mark_inode(trans, old, new, flags); -+ case KEY_TYPE_reservation: -+ return bch2_trans_mark_reservation(trans, k, flags); -+ case KEY_TYPE_reflink_p: -+ return bch2_trans_mark_reflink_p(trans, k, flags); -+ default: -+ return 0; -+ } -+} -+ +static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, + struct bch_dev *ca, size_t b, + enum bch_data_type type, @@ -28619,12 +30135,7 @@ index 000000000000..6fc93b56bcb2 +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; -+ struct bkey_alloc_unpacked u; -+ struct bkey_alloc_buf *a; -+ struct bch_extent_ptr ptr = { -+ .dev = ca->dev_idx, -+ .offset = bucket_to_sector(ca, b), -+ }; ++ struct bkey_i_alloc_v4 *a; + int ret = 0; + + /* @@ -28633,27 +30144,28 @@ index 000000000000..6fc93b56bcb2 + if (b >= ca->mi.nbuckets) + return 0; + -+ a = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u); ++ a = bch2_trans_start_alloc_update(trans, &iter, POS(ca->dev_idx, b)); + if (IS_ERR(a)) + return PTR_ERR(a); + -+ if (u.data_type && u.data_type != type) { ++ if (a->v.data_type && a->v.data_type != type) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" + "while marking %s", -+ iter.pos.inode, iter.pos.offset, 
u.gen, -+ bch2_data_types[u.data_type], ++ iter.pos.inode, iter.pos.offset, a->v.gen, ++ bch2_data_types[a->v.data_type], + bch2_data_types[type], + bch2_data_types[type]); + ret = -EIO; + goto out; + } + -+ u.data_type = type; -+ u.dirty_sectors = sectors; ++ a->v.data_type = type; ++ a->v.dirty_sectors = sectors; + -+ bch2_alloc_pack(c, a, u); -+ bch2_trans_update(trans, &iter, &a->k, 0); ++ ret = bch2_trans_update(trans, &iter, &a->k_i, 0); ++ if (ret) ++ goto out; +out: + bch2_trans_iter_exit(trans, &iter); + return ret; @@ -28814,54 +30326,31 @@ index 000000000000..6fc93b56bcb2 + +/* Startup/shutdown: */ + -+static void buckets_free_rcu(struct rcu_head *rcu) ++static void bucket_gens_free_rcu(struct rcu_head *rcu) +{ -+ struct bucket_array *buckets = -+ container_of(rcu, struct bucket_array, rcu); ++ struct bucket_gens *buckets = ++ container_of(rcu, struct bucket_gens, rcu); + -+ kvpfree(buckets, -+ sizeof(struct bucket_array) + -+ buckets->nbuckets * sizeof(struct bucket)); ++ kvpfree(buckets, sizeof(*buckets) + buckets->nbuckets); +} + +int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) +{ -+ struct bucket_array *buckets = NULL, *old_buckets = NULL; ++ struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL; + unsigned long *buckets_nouse = NULL; -+ alloc_fifo free[RESERVE_NR]; -+ alloc_fifo free_inc; -+ alloc_heap alloc_heap; -+ -+ size_t btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, -+ ca->mi.bucket_size / c->opts.btree_node_size); -+ /* XXX: these should be tunable */ -+ size_t reserve_none = max_t(size_t, 1, nbuckets >> 9); -+ size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 6); -+ size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12), -+ btree_reserve * 2); -+ bool resize = ca->buckets[0] != NULL; ++ bool resize = ca->bucket_gens != NULL; + int ret = -ENOMEM; -+ unsigned i; + -+ memset(&free, 0, sizeof(free)); -+ memset(&free_inc, 0, sizeof(free_inc)); -+ memset(&alloc_heap, 0, sizeof(alloc_heap)); -+ -+ 
if (!(buckets = kvpmalloc(sizeof(struct bucket_array) + -+ nbuckets * sizeof(struct bucket), ++ if (!(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets, + GFP_KERNEL|__GFP_ZERO)) || -+ !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * ++ (c->opts.buckets_nouse && ++ !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * + sizeof(unsigned long), -+ GFP_KERNEL|__GFP_ZERO)) || -+ !init_fifo(&free[RESERVE_MOVINGGC], -+ copygc_reserve, GFP_KERNEL) || -+ !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) || -+ !init_fifo(&free_inc, free_inc_nr, GFP_KERNEL) || -+ !init_heap(&alloc_heap, ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL)) ++ GFP_KERNEL|__GFP_ZERO)))) + goto err; + -+ buckets->first_bucket = ca->mi.first_bucket; -+ buckets->nbuckets = nbuckets; ++ bucket_gens->first_bucket = ca->mi.first_bucket; ++ bucket_gens->nbuckets = nbuckets; + + bch2_copygc_stop(c); + @@ -28871,56 +30360,39 @@ index 000000000000..6fc93b56bcb2 + percpu_down_write(&c->mark_lock); + } + -+ old_buckets = bucket_array(ca); ++ old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1); + + if (resize) { -+ size_t n = min(buckets->nbuckets, old_buckets->nbuckets); ++ size_t n = min(bucket_gens->nbuckets, old_bucket_gens->nbuckets); + -+ memcpy(buckets->b, -+ old_buckets->b, -+ n * sizeof(struct bucket)); -+ memcpy(buckets_nouse, -+ ca->buckets_nouse, -+ BITS_TO_LONGS(n) * sizeof(unsigned long)); ++ memcpy(bucket_gens->b, ++ old_bucket_gens->b, ++ n); ++ if (buckets_nouse) ++ memcpy(buckets_nouse, ++ ca->buckets_nouse, ++ BITS_TO_LONGS(n) * sizeof(unsigned long)); + } + -+ rcu_assign_pointer(ca->buckets[0], buckets); -+ buckets = old_buckets; ++ rcu_assign_pointer(ca->bucket_gens, bucket_gens); ++ bucket_gens = old_bucket_gens; + + swap(ca->buckets_nouse, buckets_nouse); + ++ nbuckets = ca->mi.nbuckets; ++ + if (resize) { + percpu_up_write(&c->mark_lock); ++ up_write(&ca->bucket_lock); + up_write(&c->gc_lock); + } + -+ spin_lock(&c->freelist_lock); -+ for (i = 0; i < 
RESERVE_NR; i++) { -+ fifo_move(&free[i], &ca->free[i]); -+ swap(ca->free[i], free[i]); -+ } -+ fifo_move(&free_inc, &ca->free_inc); -+ swap(ca->free_inc, free_inc); -+ spin_unlock(&c->freelist_lock); -+ -+ /* with gc lock held, alloc_heap can't be in use: */ -+ swap(ca->alloc_heap, alloc_heap); -+ -+ nbuckets = ca->mi.nbuckets; -+ -+ if (resize) -+ up_write(&ca->bucket_lock); -+ + ret = 0; +err: -+ free_heap(&alloc_heap); -+ free_fifo(&free_inc); -+ for (i = 0; i < RESERVE_NR; i++) -+ free_fifo(&free[i]); + kvpfree(buckets_nouse, + BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); -+ if (buckets) -+ call_rcu(&old_buckets->rcu, buckets_free_rcu); ++ if (bucket_gens) ++ call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu); + + return ret; +} @@ -28929,15 +30401,10 @@ index 000000000000..6fc93b56bcb2 +{ + unsigned i; + -+ free_heap(&ca->alloc_heap); -+ free_fifo(&ca->free_inc); -+ for (i = 0; i < RESERVE_NR; i++) -+ free_fifo(&ca->free[i]); + kvpfree(ca->buckets_nouse, + BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); -+ kvpfree(rcu_dereference_protected(ca->buckets[0], 1), -+ sizeof(struct bucket_array) + -+ ca->mi.nbuckets * sizeof(struct bucket)); ++ kvpfree(rcu_dereference_protected(ca->bucket_gens, 1), ++ sizeof(struct bucket_gens) + ca->mi.nbuckets); + + for (i = 0; i < ARRAY_SIZE(ca->usage); i++) + free_percpu(ca->usage[i]); @@ -28962,10 +30429,10 @@ index 000000000000..6fc93b56bcb2 +} diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h new file mode 100644 -index 000000000000..5ed9441cb115 +index 000000000000..853bc9dd1294 --- /dev/null +++ b/fs/bcachefs/buckets.h -@@ -0,0 +1,291 @@ +@@ -0,0 +1,298 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Code for manipulating bucket marks for garbage collection. 
@@ -28983,57 +30450,49 @@ index 000000000000..5ed9441cb115 + for (_b = (_buckets)->b + (_buckets)->first_bucket; \ + _b < (_buckets)->b + (_buckets)->nbuckets; _b++) + -+#define bucket_cmpxchg(g, new, expr) \ -+({ \ -+ struct bucket *_g = g; \ -+ u64 _v = atomic64_read(&(g)->_mark.v); \ -+ struct bucket_mark _old; \ -+ \ -+ do { \ -+ (new).v.counter = _old.v.counter = _v; \ -+ expr; \ -+ } while ((_v = atomic64_cmpxchg(&(_g)->_mark.v, \ -+ _old.v.counter, \ -+ (new).v.counter)) != _old.v.counter);\ -+ _old; \ -+}) -+ -+static inline struct bucket_array *__bucket_array(struct bch_dev *ca, -+ bool gc) ++static inline void bucket_unlock(struct bucket *b) +{ -+ return rcu_dereference_check(ca->buckets[gc], ++ smp_store_release(&b->lock, 0); ++} ++ ++static inline void bucket_lock(struct bucket *b) ++{ ++ while (xchg(&b->lock, 1)) ++ cpu_relax(); ++} ++ ++static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca) ++{ ++ return rcu_dereference_check(ca->buckets_gc, + !ca->fs || + percpu_rwsem_is_held(&ca->fs->mark_lock) || + lockdep_is_held(&ca->fs->gc_lock) || + lockdep_is_held(&ca->bucket_lock)); +} + -+static inline struct bucket_array *bucket_array(struct bch_dev *ca) ++static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b) +{ -+ return __bucket_array(ca, false); -+} -+ -+static inline struct bucket *__bucket(struct bch_dev *ca, size_t b, bool gc) -+{ -+ struct bucket_array *buckets = __bucket_array(ca, gc); ++ struct bucket_array *buckets = gc_bucket_array(ca); + + BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets); + return buckets->b + b; +} + -+static inline struct bucket *bucket(struct bch_dev *ca, size_t b) ++static inline struct bucket_gens *bucket_gens(struct bch_dev *ca) +{ -+ return __bucket(ca, b, false); ++ return rcu_dereference_check(ca->bucket_gens, ++ !ca->fs || ++ percpu_rwsem_is_held(&ca->fs->mark_lock) || ++ lockdep_is_held(&ca->fs->gc_lock) || ++ lockdep_is_held(&ca->bucket_lock)); +} + -+/* -+ * bucket_gc_gen() 
returns the difference between the bucket's current gen and -+ * the oldest gen of any pointer into that bucket in the btree. -+ */ -+ -+static inline u8 bucket_gc_gen(struct bucket *g) ++static inline u8 *bucket_gen(struct bch_dev *ca, size_t b) +{ -+ return g->mark.gen - g->oldest_gen; ++ struct bucket_gens *gens = bucket_gens(ca); ++ ++ BUG_ON(b < gens->first_bucket || b >= gens->nbuckets); ++ return gens->b + b; +} + +static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, @@ -29042,11 +30501,18 @@ index 000000000000..5ed9441cb115 + return sector_to_bucket(ca, ptr->offset); +} + -+static inline struct bucket *PTR_BUCKET(struct bch_dev *ca, -+ const struct bch_extent_ptr *ptr, -+ bool gc) ++static inline struct bpos PTR_BUCKET_POS(const struct bch_fs *c, ++ const struct bch_extent_ptr *ptr) +{ -+ return __bucket(ca, PTR_BUCKET_NR(ca, ptr), gc); ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ ++ return POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)); ++} ++ ++static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca, ++ const struct bch_extent_ptr *ptr) ++{ ++ return gc_bucket(ca, PTR_BUCKET_NR(ca, ptr)); +} + +static inline enum bch_data_type ptr_data_type(const struct bkey *k, @@ -29059,18 +30525,6 @@ index 000000000000..5ed9441cb115 + return ptr->cached ? 
BCH_DATA_cached : BCH_DATA_user; +} + -+static inline struct bucket_mark ptr_bucket_mark(struct bch_dev *ca, -+ const struct bch_extent_ptr *ptr) -+{ -+ struct bucket_mark m; -+ -+ rcu_read_lock(); -+ m = READ_ONCE(PTR_BUCKET(ca, ptr, 0)->mark); -+ rcu_read_unlock(); -+ -+ return m; -+} -+ +static inline int gen_cmp(u8 a, u8 b) +{ + return (s8) (a - b); @@ -29090,26 +30544,13 @@ index 000000000000..5ed9441cb115 +static inline u8 ptr_stale(struct bch_dev *ca, + const struct bch_extent_ptr *ptr) +{ -+ return gen_after(ptr_bucket_mark(ca, ptr).gen, ptr->gen); -+} ++ u8 ret; + -+/* bucket gc marks */ ++ rcu_read_lock(); ++ ret = gen_after(*bucket_gen(ca, PTR_BUCKET_NR(ca, ptr)), ptr->gen); ++ rcu_read_unlock(); + -+static inline unsigned bucket_sectors_used(struct bucket_mark mark) -+{ -+ return mark.dirty_sectors + mark.cached_sectors; -+} -+ -+static inline bool is_available_bucket(struct bucket_mark mark) -+{ -+ return !mark.dirty_sectors && !mark.stripe; -+} -+ -+static inline bool bucket_needs_journal_commit(struct bucket_mark m, -+ u16 last_seq_ondisk) -+{ -+ return m.journal_seq_valid && -+ ((s16) m.journal_seq - (s16) last_seq_ondisk > 0); ++ return ret; +} + +/* Device usage: */ @@ -29117,50 +30558,50 @@ index 000000000000..5ed9441cb115 +struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *); + +static inline u64 __dev_buckets_available(struct bch_dev *ca, -+ struct bch_dev_usage stats) ++ struct bch_dev_usage stats, ++ enum alloc_reserve reserve) +{ -+ u64 total = ca->mi.nbuckets - ca->mi.first_bucket; ++ s64 total = ca->mi.nbuckets - ca->mi.first_bucket; ++ s64 reserved = 0; ++ ++ switch (reserve) { ++ case RESERVE_none: ++ reserved += ca->mi.nbuckets >> 6; ++ fallthrough; ++ case RESERVE_movinggc: ++ reserved += ca->nr_btree_reserve; ++ fallthrough; ++ case RESERVE_btree: ++ reserved += ca->nr_btree_reserve; ++ fallthrough; ++ case RESERVE_btree_movinggc: ++ break; ++ default: ++ BUG(); ++ } + + if (WARN_ONCE(stats.buckets_unavailable > total, + 
"buckets_unavailable overflow (%llu > %llu)\n", + stats.buckets_unavailable, total)) + return 0; + -+ return total - stats.buckets_unavailable; ++ return max_t(s64, 0, ++ total - ++ stats.buckets_unavailable - ++ ca->nr_open_buckets - ++ reserved); +} + -+static inline u64 dev_buckets_available(struct bch_dev *ca) ++static inline u64 dev_buckets_available(struct bch_dev *ca, ++ enum alloc_reserve reserve) +{ -+ return __dev_buckets_available(ca, bch2_dev_usage_read(ca)); -+} -+ -+static inline u64 __dev_buckets_reclaimable(struct bch_dev *ca, -+ struct bch_dev_usage stats) -+{ -+ struct bch_fs *c = ca->fs; -+ s64 available = __dev_buckets_available(ca, stats); -+ unsigned i; -+ -+ spin_lock(&c->freelist_lock); -+ for (i = 0; i < RESERVE_NR; i++) -+ available -= fifo_used(&ca->free[i]); -+ available -= fifo_used(&ca->free_inc); -+ available -= ca->nr_open_buckets; -+ spin_unlock(&c->freelist_lock); -+ -+ return max(available, 0LL); -+} -+ -+static inline u64 dev_buckets_reclaimable(struct bch_dev *ca) -+{ -+ return __dev_buckets_reclaimable(ca, bch2_dev_usage_read(ca)); ++ return __dev_buckets_available(ca, bch2_dev_usage_read(ca), reserve); +} + +/* Filesystem usage: */ + +static inline unsigned fs_usage_u64s(struct bch_fs *c) +{ -+ + return sizeof(struct bch_fs_usage) / sizeof(u64) + + READ_ONCE(c->replicas.nr); +} @@ -29186,22 +30627,55 @@ index 000000000000..5ed9441cb115 + +/* key/bucket marking: */ + -+void bch2_bucket_seq_cleanup(struct bch_fs *); +void bch2_fs_usage_initialize(struct bch_fs *); + -+void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, size_t, bool); +void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, + size_t, enum bch_data_type, unsigned, + struct gc_pos, unsigned); + -+int bch2_mark_key(struct btree_trans *, struct bkey_s_c, unsigned); ++int bch2_mark_alloc(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); ++int bch2_mark_extent(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); ++int 
bch2_mark_stripe(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); ++int bch2_mark_inode(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); ++int bch2_mark_reservation(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); ++int bch2_mark_reflink_p(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); + -+int bch2_mark_update(struct btree_trans *, struct btree_path *, -+ struct bkey_i *, unsigned); ++int bch2_trans_mark_extent(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned); ++int bch2_trans_mark_stripe(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned); ++int bch2_trans_mark_inode(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned); ++int bch2_trans_mark_reservation(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned); ++int bch2_trans_mark_reflink_p(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned); ++ ++int bch2_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); + +int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, -+ struct bkey_s_c, unsigned); -+void bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); ++ struct bkey_i *, unsigned); ++ ++static inline int bch2_trans_mark_old(struct btree_trans *trans, ++ struct bkey_s_c old, unsigned flags) ++{ ++ struct bkey_i deleted; ++ ++ bkey_init(&deleted.k); ++ deleted.k.p = old.k->p; ++ ++ return bch2_trans_mark_key(trans, old, &deleted, ++ BTREE_TRIGGER_OVERWRITE|flags); ++} ++ ++static inline int bch2_trans_mark_new(struct btree_trans *trans, ++ struct bkey_i *new, unsigned flags) ++{ ++ struct bkey_i deleted; ++ ++ bkey_init(&deleted.k); ++ deleted.k.p = new->k.p; ++ ++ return bch2_trans_mark_key(trans, bkey_i_to_s_c(&deleted), new, ++ BTREE_TRIGGER_INSERT|flags); ++} ++ ++int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); + +int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct 
bch_dev *, + size_t, enum bch_data_type, unsigned); @@ -29259,10 +30733,10 @@ index 000000000000..5ed9441cb115 +#endif /* _BUCKETS_H */ diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h new file mode 100644 -index 000000000000..b2de2995c5e7 +index 000000000000..e79a33795bf9 --- /dev/null +++ b/fs/bcachefs/buckets_types.h -@@ -0,0 +1,124 @@ +@@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BUCKETS_TYPES_H +#define _BUCKETS_TYPES_H @@ -29272,42 +30746,15 @@ index 000000000000..b2de2995c5e7 + +#define BUCKET_JOURNAL_SEQ_BITS 16 + -+struct bucket_mark { -+ union { -+ atomic64_t v; -+ -+ struct { -+ u8 gen; -+ u8 data_type:3, -+ owned_by_allocator:1, -+ journal_seq_valid:1, -+ stripe:1; -+ u16 dirty_sectors; -+ u16 cached_sectors; -+ -+ /* -+ * low bits of journal sequence number when this bucket was most -+ * recently modified: if journal_seq_valid is set, this bucket can't be -+ * reused until the journal sequence number written to disk is >= the -+ * bucket's journal sequence number: -+ */ -+ u16 journal_seq; -+ }; -+ }; -+}; -+ +struct bucket { -+ union { -+ struct bucket_mark _mark; -+ const struct bucket_mark mark; -+ }; -+ -+ u64 io_time[2]; -+ u8 oldest_gen; -+ u8 gc_gen; -+ unsigned gen_valid:1; -+ u8 stripe_redundancy; -+ u32 stripe; ++ u8 lock; ++ u8 gen_valid:1; ++ u8 data_type:7; ++ u8 gen; ++ u8 stripe_redundancy; ++ u32 stripe; ++ u32 dirty_sectors; ++ u32 cached_sectors; +}; + +struct bucket_array { @@ -29317,6 +30764,13 @@ index 000000000000..b2de2995c5e7 + struct bucket b[]; +}; + ++struct bucket_gens { ++ struct rcu_head rcu; ++ u16 first_bucket; ++ size_t nbuckets; ++ u8 b[]; ++}; ++ +struct bch_dev_usage { + u64 buckets_ec; + u64 buckets_unavailable; @@ -29379,7 +30833,7 @@ index 000000000000..b2de2995c5e7 + u8 dev; + u8 gen; + u8 replicas; -+ u16 fragmentation; ++ u32 fragmentation; + u32 sectors; + u64 offset; +}; @@ -29387,12 +30841,235 @@ index 000000000000..b2de2995c5e7 +typedef HEAP(struct 
copygc_heap_entry) copygc_heap; + +#endif /* _BUCKETS_TYPES_H */ +diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c +new file mode 100644 +index 000000000000..2e5b955080de +--- /dev/null ++++ b/fs/bcachefs/buckets_waiting_for_journal.c +@@ -0,0 +1,167 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "buckets_waiting_for_journal.h" ++#include ++ ++static inline struct bucket_hashed * ++bucket_hash(struct buckets_waiting_for_journal_table *t, ++ unsigned hash_seed_idx, u64 dev_bucket) ++{ ++ unsigned h = siphash_1u64(dev_bucket, &t->hash_seeds[hash_seed_idx]); ++ ++ BUG_ON(!is_power_of_2(t->size)); ++ ++ return t->d + (h & (t->size - 1)); ++} ++ ++static void bucket_table_init(struct buckets_waiting_for_journal_table *t, size_t size) ++{ ++ unsigned i; ++ ++ t->size = size; ++ for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) ++ get_random_bytes(&t->hash_seeds[i], sizeof(t->hash_seeds[i])); ++ memset(t->d, 0, sizeof(t->d[0]) * size); ++} ++ ++bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, ++ u64 flushed_seq, ++ unsigned dev, u64 bucket) ++{ ++ struct buckets_waiting_for_journal_table *t; ++ u64 dev_bucket = (u64) dev << 56 | bucket; ++ bool ret = false; ++ unsigned i; ++ ++ mutex_lock(&b->lock); ++ t = b->t; ++ ++ for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) { ++ struct bucket_hashed *h = bucket_hash(t, i, dev_bucket); ++ ++ if (h->dev_bucket == dev_bucket) { ++ ret = h->journal_seq > flushed_seq; ++ break; ++ } ++ } ++ ++ mutex_unlock(&b->lock); ++ ++ return ret; ++} ++ ++static bool bucket_table_insert(struct buckets_waiting_for_journal_table *t, ++ struct bucket_hashed *new, ++ u64 flushed_seq) ++{ ++ struct bucket_hashed *last_evicted = NULL; ++ unsigned tries, i; ++ ++ for (tries = 0; tries < 10; tries++) { ++ struct bucket_hashed *old, *victim = NULL; ++ ++ for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) { ++ old = bucket_hash(t, i, new->dev_bucket); ++ 
++ if (old->dev_bucket == new->dev_bucket || ++ old->journal_seq <= flushed_seq) { ++ *old = *new; ++ return true; ++ } ++ ++ if (last_evicted != old) ++ victim = old; ++ } ++ ++ /* hashed to same slot 3 times: */ ++ if (!victim) ++ break; ++ ++ /* Failed to find an empty slot: */ ++ swap(*new, *victim); ++ last_evicted = victim; ++ } ++ ++ return false; ++} ++ ++int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, ++ u64 flushed_seq, ++ unsigned dev, u64 bucket, ++ u64 journal_seq) ++{ ++ struct buckets_waiting_for_journal_table *t, *n; ++ struct bucket_hashed tmp, new = { ++ .dev_bucket = (u64) dev << 56 | bucket, ++ .journal_seq = journal_seq, ++ }; ++ size_t i, new_size, nr_elements = 1, nr_rehashes = 0; ++ int ret = 0; ++ ++ mutex_lock(&b->lock); ++ ++ if (likely(bucket_table_insert(b->t, &new, flushed_seq))) ++ goto out; ++ ++ t = b->t; ++ for (i = 0; i < t->size; i++) ++ nr_elements += t->d[i].journal_seq > flushed_seq; ++ ++ new_size = nr_elements < t->size / 3 ? 
t->size : t->size * 2; ++ ++ n = kvmalloc(sizeof(*n) + sizeof(n->d[0]) * new_size, GFP_KERNEL); ++ if (!n) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++retry_rehash: ++ nr_rehashes++; ++ bucket_table_init(n, new_size); ++ ++ tmp = new; ++ BUG_ON(!bucket_table_insert(n, &tmp, flushed_seq)); ++ ++ for (i = 0; i < t->size; i++) { ++ if (t->d[i].journal_seq <= flushed_seq) ++ continue; ++ ++ tmp = t->d[i]; ++ if (!bucket_table_insert(n, &tmp, flushed_seq)) ++ goto retry_rehash; ++ } ++ ++ b->t = n; ++ kvfree(t); ++ ++ pr_debug("took %zu rehashes, table at %zu/%zu elements", ++ nr_rehashes, nr_elements, b->t->size); ++out: ++ mutex_unlock(&b->lock); ++ ++ return ret; ++} ++ ++void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *c) ++{ ++ struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal; ++ ++ kvfree(b->t); ++} ++ ++#define INITIAL_TABLE_SIZE 8 ++ ++int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *c) ++{ ++ struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal; ++ ++ mutex_init(&b->lock); ++ ++ b->t = kvmalloc(sizeof(*b->t) + sizeof(b->t->d[0]) * INITIAL_TABLE_SIZE, GFP_KERNEL); ++ if (!b->t) ++ return -ENOMEM; ++ ++ bucket_table_init(b->t, INITIAL_TABLE_SIZE); ++ return 0; ++} +diff --git a/fs/bcachefs/buckets_waiting_for_journal.h b/fs/bcachefs/buckets_waiting_for_journal.h +new file mode 100644 +index 000000000000..d2ae19cbe18c +--- /dev/null ++++ b/fs/bcachefs/buckets_waiting_for_journal.h +@@ -0,0 +1,15 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BUCKETS_WAITING_FOR_JOURNAL_H ++#define _BUCKETS_WAITING_FOR_JOURNAL_H ++ ++#include "buckets_waiting_for_journal_types.h" ++ ++bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *, ++ u64, unsigned, u64); ++int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *, ++ u64, unsigned, u64, u64); ++ ++void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *); ++int bch2_fs_buckets_waiting_for_journal_init(struct 
bch_fs *); ++ ++#endif /* _BUCKETS_WAITING_FOR_JOURNAL_H */ +diff --git a/fs/bcachefs/buckets_waiting_for_journal_types.h b/fs/bcachefs/buckets_waiting_for_journal_types.h +new file mode 100644 +index 000000000000..fea7f944d0ed +--- /dev/null ++++ b/fs/bcachefs/buckets_waiting_for_journal_types.h +@@ -0,0 +1,23 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H ++#define _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H ++ ++#include ++ ++struct bucket_hashed { ++ u64 dev_bucket; ++ u64 journal_seq; ++}; ++ ++struct buckets_waiting_for_journal_table { ++ size_t size; ++ siphash_key_t hash_seeds[3]; ++ struct bucket_hashed d[]; ++}; ++ ++struct buckets_waiting_for_journal { ++ struct mutex lock; ++ struct buckets_waiting_for_journal_table *t; ++}; ++ ++#endif /* _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H */ diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c new file mode 100644 -index 000000000000..db68a78276cf +index 000000000000..aa26588ed5ed --- /dev/null +++ b/fs/bcachefs/chardev.c -@@ -0,0 +1,758 @@ +@@ -0,0 +1,761 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_CHARDEV + @@ -29963,8 +31640,11 @@ index 000000000000..db68a78276cf + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + ++ if (!dev) ++ return -EINVAL; ++ + for_each_online_member(ca, c, i) -+ if (ca->disk_sb.bdev->bd_dev == dev) { ++ if (ca->dev == dev) { + percpu_ref_put(&ca->io_ref); + return i; + } @@ -30190,10 +31870,10 @@ index 000000000000..3a4890d39ff9 +#endif /* _BCACHEFS_CHARDEV_H */ diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c new file mode 100644 -index 000000000000..fbe8603cfb30 +index 000000000000..425582f60d7a --- /dev/null +++ b/fs/bcachefs/checksum.c -@@ -0,0 +1,653 @@ +@@ -0,0 +1,665 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "checksum.h" @@ -30289,9 +31969,9 @@ index 000000000000..fbe8603cfb30 + } +} + -+static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm, -+ struct nonce nonce, -+ 
struct scatterlist *sg, size_t len) ++static inline int do_encrypt_sg(struct crypto_sync_skcipher *tfm, ++ struct nonce nonce, ++ struct scatterlist *sg, size_t len) +{ + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); + int ret; @@ -30300,17 +31980,20 @@ index 000000000000..fbe8603cfb30 + skcipher_request_set_crypt(req, sg, sg, len, nonce.d); + + ret = crypto_skcipher_encrypt(req); -+ BUG_ON(ret); ++ if (ret) ++ pr_err("got error %i from crypto_skcipher_encrypt()", ret); ++ ++ return ret; +} + -+static inline void do_encrypt(struct crypto_sync_skcipher *tfm, ++static inline int do_encrypt(struct crypto_sync_skcipher *tfm, + struct nonce nonce, + void *buf, size_t len) +{ + struct scatterlist sg; + + sg_init_one(&sg, buf, len); -+ do_encrypt_sg(tfm, nonce, &sg, len); ++ return do_encrypt_sg(tfm, nonce, &sg, len); +} + +int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, @@ -30332,25 +32015,29 @@ index 000000000000..fbe8603cfb30 + goto err; + } + -+ do_encrypt(chacha20, nonce, buf, len); ++ ret = do_encrypt(chacha20, nonce, buf, len); +err: + crypto_free_sync_skcipher(chacha20); + return ret; +} + -+static void gen_poly_key(struct bch_fs *c, struct shash_desc *desc, -+ struct nonce nonce) ++static int gen_poly_key(struct bch_fs *c, struct shash_desc *desc, ++ struct nonce nonce) +{ + u8 key[POLY1305_KEY_SIZE]; ++ int ret; + + nonce.d[3] ^= BCH_NONCE_POLY; + + memset(key, 0, sizeof(key)); -+ do_encrypt(c->chacha20, nonce, key, sizeof(key)); ++ ret = do_encrypt(c->chacha20, nonce, key, sizeof(key)); ++ if (ret) ++ return ret; + + desc->tfm = c->poly1305; + crypto_shash_init(desc); + crypto_shash_update(desc, key, sizeof(key)); ++ return 0; +} + +struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, @@ -30392,13 +32079,13 @@ index 000000000000..fbe8603cfb30 + } +} + -+void bch2_encrypt(struct bch_fs *c, unsigned type, ++int bch2_encrypt(struct bch_fs *c, unsigned type, + struct nonce nonce, void *data, size_t len) +{ + if 
(!bch2_csum_type_is_encryption(type)) -+ return; ++ return 0; + -+ do_encrypt(c->chacha20, nonce, data, len); ++ return do_encrypt(c->chacha20, nonce, data, len); +} + +static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, @@ -30473,23 +32160,27 @@ index 000000000000..fbe8603cfb30 + return __bch2_checksum_bio(c, type, nonce, bio, &iter); +} + -+void bch2_encrypt_bio(struct bch_fs *c, unsigned type, -+ struct nonce nonce, struct bio *bio) ++int bch2_encrypt_bio(struct bch_fs *c, unsigned type, ++ struct nonce nonce, struct bio *bio) +{ + struct bio_vec bv; + struct bvec_iter iter; + struct scatterlist sgl[16], *sg = sgl; + size_t bytes = 0; ++ int ret = 0; + + if (!bch2_csum_type_is_encryption(type)) -+ return; ++ return 0; + + sg_init_table(sgl, ARRAY_SIZE(sgl)); + + bio_for_each_segment(bv, bio, iter) { + if (sg == sgl + ARRAY_SIZE(sgl)) { + sg_mark_end(sg - 1); -+ do_encrypt_sg(c->chacha20, nonce, sgl, bytes); ++ ++ ret = do_encrypt_sg(c->chacha20, nonce, sgl, bytes); ++ if (ret) ++ return ret; + + nonce = nonce_add(nonce, bytes); + bytes = 0; @@ -30503,7 +32194,7 @@ index 000000000000..fbe8603cfb30 + } + + sg_mark_end(sg - 1); -+ do_encrypt_sg(c->chacha20, nonce, sgl, bytes); ++ return do_encrypt_sg(c->chacha20, nonce, sgl, bytes); +} + +struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a, @@ -30603,16 +32294,12 @@ index 000000000000..fbe8603cfb30 +} + +#ifdef __KERNEL__ -+int bch2_request_key(struct bch_sb *sb, struct bch_key *key) ++static int __bch2_request_key(char *key_description, struct bch_key *key) +{ -+ char key_description[60]; + struct key *keyring_key; + const struct user_key_payload *ukp; + int ret; + -+ snprintf(key_description, sizeof(key_description), -+ "bcachefs:%pUb", &sb->user_uuid); -+ + keyring_key = request_key(&key_type_logon, key_description, NULL); + if (IS_ERR(keyring_key)) + return PTR_ERR(keyring_key); @@ -30632,16 +32319,10 @@ index 000000000000..fbe8603cfb30 +} +#else +#include -+#include + 
-+int bch2_request_key(struct bch_sb *sb, struct bch_key *key) ++static int __bch2_request_key(char *key_description, struct bch_key *key) +{ + key_serial_t key_id; -+ char key_description[60]; -+ char uuid[40]; -+ -+ uuid_unparse_lower(sb->user_uuid.b, uuid); -+ sprintf(key_description, "bcachefs:%s", uuid); + + key_id = request_key("user", key_description, NULL, + KEY_SPEC_USER_KEYRING); @@ -30655,6 +32336,17 @@ index 000000000000..fbe8603cfb30 +} +#endif + ++int bch2_request_key(struct bch_sb *sb, struct bch_key *key) ++{ ++ char key_description[60]; ++ char uuid[40]; ++ ++ uuid_unparse_lower(sb->user_uuid.b, uuid); ++ sprintf(key_description, "bcachefs:%s", uuid); ++ ++ return __bch2_request_key(key_description, key); ++} ++ +int bch2_decrypt_sb_key(struct bch_fs *c, + struct bch_sb_field_crypt *crypt, + struct bch_key *key) @@ -30849,7 +32541,7 @@ index 000000000000..fbe8603cfb30 +} diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h new file mode 100644 -index 000000000000..f5c1a609c5c4 +index 000000000000..c86c3c05d620 --- /dev/null +++ b/fs/bcachefs/checksum.h @@ -0,0 +1,204 @@ @@ -30904,7 +32596,7 @@ index 000000000000..f5c1a609c5c4 +int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t); +int bch2_request_key(struct bch_sb *, struct bch_key *); + -+void bch2_encrypt(struct bch_fs *, unsigned, struct nonce, ++int bch2_encrypt(struct bch_fs *, unsigned, struct nonce, + void *data, size_t); + +struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned, @@ -30916,8 +32608,8 @@ index 000000000000..f5c1a609c5c4 + struct bch_extent_crc_unpacked *, + unsigned, unsigned, unsigned); + -+void bch2_encrypt_bio(struct bch_fs *, unsigned, -+ struct nonce, struct bio *); ++int bch2_encrypt_bio(struct bch_fs *, unsigned, ++ struct nonce, struct bio *); + +int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *, + struct bch_key *); @@ -31343,10 +33035,10 @@ index 000000000000..5fae0012d808 +#endif /* _BCACHEFS_CLOCK_TYPES_H */ 
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c new file mode 100644 -index 000000000000..f63651d291e5 +index 000000000000..482fcff93b62 --- /dev/null +++ b/fs/bcachefs/compress.c -@@ -0,0 +1,640 @@ +@@ -0,0 +1,641 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "checksum.h" @@ -31375,7 +33067,7 @@ index 000000000000..f63651d291e5 +{ + void *b; + -+ BUG_ON(size > c->sb.encoded_extent_max << 9); ++ BUG_ON(size > c->opts.encoded_extent_max); + + b = kmalloc(size, GFP_NOIO|__GFP_NOWARN); + if (b) @@ -31417,7 +33109,7 @@ index 000000000000..f63651d291e5 + struct page **pages = NULL; + void *data; + -+ BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max); ++ BUG_ON(start.bi_size > c->opts.encoded_extent_max); + + if (!PageHighMem(bio_iter_page(bio, start)) && + bio_phys_contig(bio, start)) @@ -31548,6 +33240,8 @@ index 000000000000..f63651d291e5 + workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO); + ctx = ZSTD_initDCtx(workspace, ZSTD_DCtxWorkspaceBound()); + ++ src_len = le32_to_cpup(src_data.b); ++ + ret = ZSTD_decompressDCtx(ctx, + dst_data, dst_len, + src_data.b + 4, real_src_len); @@ -31580,8 +33274,8 @@ index 000000000000..f63651d291e5 + BUG_ON(!bio->bi_vcnt); + BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs); + -+ if (crc->uncompressed_size > c->sb.encoded_extent_max || -+ crc->compressed_size > c->sb.encoded_extent_max) { ++ if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max || ++ crc->compressed_size << 9 > c->opts.encoded_extent_max) { + bch_err(c, "error rewriting existing data: extent too big"); + return -EIO; + } @@ -31621,8 +33315,8 @@ index 000000000000..f63651d291e5 + size_t dst_len = crc.uncompressed_size << 9; + int ret = -ENOMEM; + -+ if (crc.uncompressed_size > c->sb.encoded_extent_max || -+ crc.compressed_size > c->sb.encoded_extent_max) ++ if (crc.uncompressed_size << 9 > c->opts.encoded_extent_max || ++ crc.compressed_size << 9 > c->opts.encoded_extent_max) + 
return -EIO; + + dst_data = dst_len == dst_iter.bi_size @@ -31725,7 +33419,7 @@ index 000000000000..f63651d291e5 + BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type])); + + /* If it's only one block, don't bother trying to compress: */ -+ if (bio_sectors(src) <= c->opts.block_size) ++ if (src->bi_iter.bi_size <= c->opts.block_size) + return 0; + + dst_data = bio_map_or_bounce(c, dst, WRITE); @@ -31815,7 +33509,7 @@ index 000000000000..f63651d291e5 + + /* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */ + src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size, -+ c->sb.encoded_extent_max << 9); ++ c->opts.encoded_extent_max); + /* Don't generate a bigger output than input: */ + dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); + @@ -31893,10 +33587,9 @@ index 000000000000..f63651d291e5 + +static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) +{ -+ size_t max_extent = c->sb.encoded_extent_max << 9; + size_t decompress_workspace_size = 0; + bool decompress_workspace_needed; -+ ZSTD_parameters params = ZSTD_getParams(0, max_extent, 0); ++ ZSTD_parameters params = ZSTD_getParams(0, c->opts.encoded_extent_max, 0); + struct { + unsigned feature; + unsigned type; @@ -31928,14 +33621,14 @@ index 000000000000..f63651d291e5 + + if (!mempool_initialized(&c->compression_bounce[READ])) { + ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[READ], -+ 1, max_extent); ++ 1, c->opts.encoded_extent_max); + if (ret) + goto out; + } + + if (!mempool_initialized(&c->compression_bounce[WRITE])) { + ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE], -+ 1, max_extent); ++ 1, c->opts.encoded_extent_max); + if (ret) + goto out; + } @@ -32011,12 +33704,94 @@ index 000000000000..4bab1f61b3b5 +int bch2_fs_compress_init(struct bch_fs *); + +#endif /* _BCACHEFS_COMPRESS_H */ +diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h +new file mode 100644 +index 000000000000..745b1cdb0d17 +--- /dev/null ++++ 
b/fs/bcachefs/darray.h +@@ -0,0 +1,76 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_DARRAY_H ++#define _BCACHEFS_DARRAY_H ++ ++/* ++ * Dynamic arrays: ++ * ++ * Inspired by CCAN's darray ++ */ ++ ++#include "util.h" ++#include ++ ++#define DARRAY(type) \ ++struct { \ ++ size_t nr, size; \ ++ type *data; \ ++} ++ ++typedef DARRAY(void) darray_void; ++ ++static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more) ++{ ++ if (d->nr + more > d->size) { ++ size_t new_size = roundup_pow_of_two(d->nr + more); ++ void *data = krealloc_array(d->data, new_size, t_size, GFP_KERNEL); ++ ++ if (!data) ++ return -ENOMEM; ++ ++ d->data = data; ++ d->size = new_size; ++ } ++ ++ return 0; ++} ++ ++#define darray_make_room(_d, _more) \ ++ __darray_make_room((darray_void *) &(_d), sizeof((_d).data[0]), (_more)) ++ ++#define darray_top(_d) ((_d).data[(_d).nr]) ++ ++#define darray_push(_d, _item) \ ++({ \ ++ int _ret = darray_make_room((_d), 1); \ ++ \ ++ if (!_ret) \ ++ (_d).data[(_d).nr++] = (_item); \ ++ _ret; \ ++}) ++ ++#define darray_insert_item(_d, _pos, _item) \ ++({ \ ++ int _ret = darray_make_room((_d), 1); \ ++ \ ++ if (!_ret) \ ++ array_insert_item((_d).data, (_d).nr, (_pos), (_item)); \ ++ _ret; \ ++}) ++ ++#define darray_for_each(_d, _i) \ ++ for (_i = (_d).data; _i < (_d).data + (_d).nr; _i++) ++ ++#define darray_init(_d) \ ++do { \ ++ (_d).data = NULL; \ ++ (_d).nr = (_d).size = 0; \ ++} while (0) ++ ++#define darray_exit(_d) \ ++do { \ ++ kfree((_d).data); \ ++ darray_init(_d); \ ++} while (0) ++ ++#endif /* _BCACHEFS_DARRAY_H */ diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c new file mode 100644 -index 000000000000..294e4baf4deb +index 000000000000..2d65ae370931 --- /dev/null +++ b/fs/bcachefs/debug.c -@@ -0,0 +1,476 @@ +@@ -0,0 +1,628 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Assorted bcachefs debug code @@ -32188,10 +33963,11 @@ index 000000000000..294e4baf4deb + failed |= bch2_btree_verify_replica(c, b, p); + + if 
(failed) { -+ char buf[200]; ++ struct printbuf buf = PRINTBUF; + -+ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(&b->key)); -+ bch2_fs_fatal_error(c, "btree node verify failed for : %s\n", buf); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); ++ bch2_fs_fatal_error(c, "btree node verify failed for : %s\n", buf.buf); ++ printbuf_exit(&buf); + } +out: + mutex_unlock(&c->verify_lock); @@ -32203,12 +33979,12 @@ index 000000000000..294e4baf4deb +/* XXX: bch_fs refcounting */ + +struct dump_iter { -+ struct bpos from; -+ struct bch_fs *c; ++ struct bch_fs *c; + enum btree_id id; ++ struct bpos from; ++ u64 iter; + -+ char buf[1 << 12]; -+ size_t bytes; /* what's currently in buf */ ++ struct printbuf buf; + + char __user *ubuf; /* destination user buffer */ + size_t size; /* size of requested read */ @@ -32217,9 +33993,9 @@ index 000000000000..294e4baf4deb + +static int flush_buf(struct dump_iter *i) +{ -+ if (i->bytes) { -+ size_t bytes = min(i->bytes, i->size); -+ int err = copy_to_user(i->ubuf, i->buf, bytes); ++ if (i->buf.pos) { ++ size_t bytes = min_t(size_t, i->buf.pos, i->size); ++ int err = copy_to_user(i->ubuf, i->buf.buf, bytes); + + if (err) + return err; @@ -32227,8 +34003,8 @@ index 000000000000..294e4baf4deb + i->ret += bytes; + i->ubuf += bytes; + i->size -= bytes; -+ i->bytes -= bytes; -+ memmove(i->buf, i->buf + bytes, i->bytes); ++ i->buf.pos -= bytes; ++ memmove(i->buf.buf, i->buf.buf + bytes, i->buf.pos); + } + + return 0; @@ -32245,15 +34021,20 @@ index 000000000000..294e4baf4deb + + file->private_data = i; + i->from = POS_MIN; ++ i->iter = 0; + i->c = container_of(bd, struct bch_fs, btree_debug[bd->id]); + i->id = bd->id; ++ i->buf = PRINTBUF; + + return 0; +} + +static int bch2_dump_release(struct inode *inode, struct file *file) +{ -+ kfree(file->private_data); ++ struct dump_iter *i = file->private_data; ++ ++ printbuf_exit(&i->buf); ++ kfree(i); + return 0; +} + @@ -32285,11 +34066,8 @@ index 000000000000..294e4baf4deb + k = 
bch2_btree_iter_peek(&iter); + + while (k.k && !(err = bkey_err(k))) { -+ bch2_bkey_val_to_text(&PBUF(i->buf), i->c, k); -+ i->bytes = strlen(i->buf); -+ BUG_ON(i->bytes >= sizeof(i->buf)); -+ i->buf[i->bytes] = '\n'; -+ i->bytes++; ++ bch2_bkey_val_to_text(&i->buf, i->c, k); ++ pr_char(&i->buf, '\n'); + + k = bch2_btree_iter_next(&iter); + i->from = iter.pos; @@ -32338,8 +34116,7 @@ index 000000000000..294e4baf4deb + bch2_trans_init(&trans, i->c, 0, 0); + + for_each_btree_node(&trans, iter, i->id, i->from, 0, b, err) { -+ bch2_btree_node_to_text(&PBUF(i->buf), i->c, b); -+ i->bytes = strlen(i->buf); ++ bch2_btree_node_to_text(&i->buf, i->c, b); + err = flush_buf(i); + if (err) + break; @@ -32392,7 +34169,9 @@ index 000000000000..294e4baf4deb + + bch2_trans_init(&trans, i->c, 0, 0); + -+ bch2_trans_iter_init(&trans, &iter, i->id, i->from, BTREE_ITER_PREFETCH); ++ bch2_trans_iter_init(&trans, &iter, i->id, i->from, ++ BTREE_ITER_PREFETCH| ++ BTREE_ITER_ALL_SNAPSHOTS); + + while ((k = bch2_btree_iter_peek(&iter)).k && + !(err = bkey_err(k))) { @@ -32401,16 +34180,14 @@ index 000000000000..294e4baf4deb + bch2_btree_node_iter_peek(&l->iter, l->b); + + if (l->b != prev_node) { -+ bch2_btree_node_to_text(&PBUF(i->buf), i->c, l->b); -+ i->bytes = strlen(i->buf); ++ bch2_btree_node_to_text(&i->buf, i->c, l->b); + err = flush_buf(i); + if (err) + break; + } + prev_node = l->b; + -+ bch2_bfloat_to_text(&PBUF(i->buf), l->b, _k); -+ i->bytes = strlen(i->buf); ++ bch2_bfloat_to_text(&i->buf, l->b, _k); + err = flush_buf(i); + if (err) + break; @@ -32425,6 +34202,8 @@ index 000000000000..294e4baf4deb + if (!i->size) + break; + } ++ bch2_trans_iter_exit(&trans, &iter); ++ + bch2_trans_exit(&trans); + + return err < 0 ? 
err : i->ret; @@ -32437,10 +34216,148 @@ index 000000000000..294e4baf4deb + .read = bch2_read_bfloat_failed, +}; + ++static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *c, ++ struct btree *b) ++{ ++ out->tabstops[0] = 32; ++ ++ pr_buf(out, "%px btree=%s l=%u ", ++ b, ++ bch2_btree_ids[b->c.btree_id], ++ b->c.level); ++ pr_newline(out); ++ ++ pr_indent_push(out, 2); ++ ++ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); ++ pr_newline(out); ++ ++ pr_buf(out, "flags: "); ++ pr_tab(out); ++ bch2_flags_to_text(out, bch2_btree_node_flags, b->flags); ++ pr_newline(out); ++ ++ pr_buf(out, "written:"); ++ pr_tab(out); ++ pr_buf(out, "%u", b->written); ++ pr_newline(out); ++ ++ pr_buf(out, "writes blocked:"); ++ pr_tab(out); ++ pr_buf(out, "%u", !list_empty_careful(&b->write_blocked)); ++ pr_newline(out); ++ ++ pr_buf(out, "will make reachable:"); ++ pr_tab(out); ++ pr_buf(out, "%lx", b->will_make_reachable); ++ pr_newline(out); ++ ++ pr_buf(out, "journal pin %px:", &b->writes[0].journal); ++ pr_tab(out); ++ pr_buf(out, "%llu", b->writes[0].journal.seq); ++ pr_newline(out); ++ ++ pr_buf(out, "journal pin %px:", &b->writes[1].journal); ++ pr_tab(out); ++ pr_buf(out, "%llu", b->writes[1].journal.seq); ++ pr_newline(out); ++ ++ pr_indent_pop(out, 2); ++} ++ ++static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf, ++ size_t size, loff_t *ppos) ++{ ++ struct dump_iter *i = file->private_data; ++ struct bch_fs *c = i->c; ++ bool done = false; ++ int err; ++ ++ i->ubuf = buf; ++ i->size = size; ++ i->ret = 0; ++ ++ do { ++ struct bucket_table *tbl; ++ struct rhash_head *pos; ++ struct btree *b; ++ ++ err = flush_buf(i); ++ if (err) ++ return err; ++ ++ if (!i->size) ++ break; ++ ++ rcu_read_lock(); ++ i->buf.atomic++; ++ tbl = rht_dereference_rcu(c->btree_cache.table.tbl, ++ &c->btree_cache.table); ++ if (i->iter < tbl->size) { ++ rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash) ++ bch2_cached_btree_node_to_text(&i->buf, 
c, b); ++ i->iter++;; ++ } else { ++ done = true; ++ } ++ --i->buf.atomic; ++ rcu_read_unlock(); ++ } while (!done); ++ ++ if (i->buf.allocation_failure) ++ return -ENOMEM; ++ ++ return i->ret; ++} ++ ++static const struct file_operations cached_btree_nodes_ops = { ++ .owner = THIS_MODULE, ++ .open = bch2_dump_open, ++ .release = bch2_dump_release, ++ .read = bch2_cached_btree_nodes_read, ++}; ++ ++static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf, ++ size_t size, loff_t *ppos) ++{ ++ struct dump_iter *i = file->private_data; ++ struct bch_fs *c = i->c; ++ bool done = false; ++ int err; ++ ++ i->ubuf = buf; ++ i->size = size; ++ i->ret = 0; ++ ++ do { ++ err = flush_buf(i); ++ if (err) ++ return err; ++ ++ if (!i->size) ++ break; ++ ++ done = bch2_journal_seq_pins_to_text(&i->buf, &c->journal, &i->iter); ++ i->iter++; ++ } while (!done); ++ ++ if (i->buf.allocation_failure) ++ return -ENOMEM; ++ ++ return i->ret; ++} ++ ++static const struct file_operations journal_pins_ops = { ++ .owner = THIS_MODULE, ++ .open = bch2_dump_open, ++ .release = bch2_dump_release, ++ .read = bch2_journal_pins_read, ++}; ++ +void bch2_fs_debug_exit(struct bch_fs *c) +{ -+ if (!IS_ERR_OR_NULL(c->debug)) -+ debugfs_remove_recursive(c->debug); ++ if (!IS_ERR_OR_NULL(c->fs_debug_dir)) ++ debugfs_remove_recursive(c->fs_debug_dir); +} + +void bch2_fs_debug_init(struct bch_fs *c) @@ -32452,29 +34369,39 @@ index 000000000000..294e4baf4deb + return; + + snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b); -+ c->debug = debugfs_create_dir(name, bch_debug); -+ if (IS_ERR_OR_NULL(c->debug)) ++ c->fs_debug_dir = debugfs_create_dir(name, bch_debug); ++ if (IS_ERR_OR_NULL(c->fs_debug_dir)) ++ return; ++ ++ debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir, ++ c->btree_debug, &cached_btree_nodes_ops); ++ ++ debugfs_create_file("journal_pins", 0400, c->fs_debug_dir, ++ c->btree_debug, &journal_pins_ops); ++ ++ c->btree_debug_dir = debugfs_create_dir("btrees", 
c->fs_debug_dir); ++ if (IS_ERR_OR_NULL(c->btree_debug_dir)) + return; + + for (bd = c->btree_debug; + bd < c->btree_debug + ARRAY_SIZE(c->btree_debug); + bd++) { + bd->id = bd - c->btree_debug; -+ bd->btree = debugfs_create_file(bch2_btree_ids[bd->id], -+ 0400, c->debug, bd, -+ &btree_debug_ops); ++ debugfs_create_file(bch2_btree_ids[bd->id], ++ 0400, c->btree_debug_dir, bd, ++ &btree_debug_ops); + + snprintf(name, sizeof(name), "%s-formats", + bch2_btree_ids[bd->id]); + -+ bd->btree_format = debugfs_create_file(name, 0400, c->debug, bd, -+ &btree_format_debug_ops); ++ debugfs_create_file(name, 0400, c->btree_debug_dir, bd, ++ &btree_format_debug_ops); + + snprintf(name, sizeof(name), "%s-bfloat-failed", + bch2_btree_ids[bd->id]); + -+ bd->failed = debugfs_create_file(name, 0400, c->debug, bd, -+ &bfloat_failed_debug_ops); ++ debugfs_create_file(name, 0400, c->btree_debug_dir, bd, ++ &bfloat_failed_debug_ops); + } +} + @@ -32531,10 +34458,10 @@ index 000000000000..0b86736e5e1b +#endif /* _BCACHEFS_DEBUG_H */ diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c new file mode 100644 -index 000000000000..fe4a85a6a8cb +index 000000000000..760e4f74715f --- /dev/null +++ b/fs/bcachefs/dirent.c -@@ -0,0 +1,546 @@ +@@ -0,0 +1,545 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -32659,9 +34586,9 @@ index 000000000000..fe4a85a6a8cb +{ + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + -+ bch_scnmemcpy(out, d.v->d_name, -+ bch2_dirent_name_bytes(d)); -+ pr_buf(out, " -> %llu type %s", ++ pr_buf(out, "%.*s -> %llu type %s", ++ bch2_dirent_name_bytes(d), ++ d.v->d_name, + d.v->d_type != DT_SUBVOL + ? 
le64_to_cpu(d.v->d_inum) + : le32_to_cpu(d.v->d_child_subvol), @@ -32767,7 +34694,7 @@ index 000000000000..fe4a85a6a8cb +{ + struct btree_iter src_iter = { NULL }; + struct btree_iter dst_iter = { NULL }; -+ struct bkey_s_c old_src, old_dst; ++ struct bkey_s_c old_src, old_dst = bkey_s_c_null; + struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; + struct bpos dst_pos = + POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name)); @@ -32904,7 +34831,9 @@ index 000000000000..fe4a85a6a8cb + } + } + -+ bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0); ++ ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0); ++ if (ret) ++ goto out; +out_set_src: + + /* @@ -32921,7 +34850,9 @@ index 000000000000..fe4a85a6a8cb + src_update_flags |= BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE; + } + -+ bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags); ++ ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags); ++ if (ret) ++ goto out; + + if (mode == BCH_RENAME_EXCHANGE) + *src_offset = new_src->k.p.offset; @@ -33003,16 +34934,13 @@ index 000000000000..fe4a85a6a8cb + if (ret) + return ret; + -+ for_each_btree_key_norestart(trans, iter, BTREE_ID_dirents, -+ SPOS(dir.inum, 0, snapshot), 0, k, ret) { -+ if (k.k->p.inode > dir.inum) -+ break; -+ ++ for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents, ++ SPOS(dir.inum, 0, snapshot), ++ POS(dir.inum, U64_MAX), 0, k, ret) + if (k.k->type == KEY_TYPE_dirent) { + ret = -ENOTEMPTY; + break; + } -+ } + bch2_trans_iter_exit(trans, &iter); + + return ret; @@ -33036,11 +34964,9 @@ index 000000000000..fe4a85a6a8cb + if (ret) + goto err; + -+ for_each_btree_key_norestart(&trans, iter, BTREE_ID_dirents, -+ SPOS(inum.inum, ctx->pos, snapshot), 0, k, ret) { -+ if (k.k->p.inode > inum.inum) -+ break; -+ ++ for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_dirents, ++ SPOS(inum.inum, ctx->pos, snapshot), ++ POS(inum.inum, U64_MAX), 0, k, ret) { + if (k.k->type != KEY_TYPE_dirent) + continue; + @@ 
-33156,10 +35082,10 @@ index 000000000000..1bb4d802bc1d +#endif /* _BCACHEFS_DIRENT_H */ diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c new file mode 100644 -index 000000000000..c52b6faac9b4 +index 000000000000..81b41b07c24b --- /dev/null +++ b/fs/bcachefs/disk_groups.c -@@ -0,0 +1,486 @@ +@@ -0,0 +1,506 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "disk_groups.h" @@ -33179,24 +35105,20 @@ index 000000000000..c52b6faac9b4 + strncmp(l->label, r->label, sizeof(l->label)); +} + -+static const char *bch2_sb_disk_groups_validate(struct bch_sb *sb, -+ struct bch_sb_field *f) ++static int bch2_sb_disk_groups_validate(struct bch_sb *sb, ++ struct bch_sb_field *f, ++ struct printbuf *err) +{ + struct bch_sb_field_disk_groups *groups = + field_to_type(f, disk_groups); + struct bch_disk_group *g, *sorted = NULL; -+ struct bch_sb_field_members *mi; -+ struct bch_member *m; -+ unsigned i, nr_groups, len; -+ const char *err = NULL; ++ struct bch_sb_field_members *mi = bch2_sb_get_members(sb); ++ unsigned nr_groups = disk_groups_nr(groups); ++ unsigned i, len; ++ int ret = -EINVAL; + -+ mi = bch2_sb_get_members(sb); -+ groups = bch2_sb_get_disk_groups(sb); -+ nr_groups = disk_groups_nr(groups); -+ -+ for (m = mi->members; -+ m < mi->members + sb->nr_devices; -+ m++) { ++ for (i = 0; i < sb->nr_devices; i++) { ++ struct bch_member *m = mi->members + i; + unsigned g; + + if (!BCH_MEMBER_GROUP(m)) @@ -33204,45 +35126,54 @@ index 000000000000..c52b6faac9b4 + + g = BCH_MEMBER_GROUP(m) - 1; + -+ if (g >= nr_groups || -+ BCH_GROUP_DELETED(&groups->entries[g])) -+ return "disk has invalid group"; ++ if (g >= nr_groups) { ++ pr_buf(err, "disk %u has invalid label %u (have %u)", ++ i, g, nr_groups); ++ return -EINVAL; ++ } ++ ++ if (BCH_GROUP_DELETED(&groups->entries[g])) { ++ pr_buf(err, "disk %u has deleted label %u", i, g); ++ return -EINVAL; ++ } + } + + if (!nr_groups) -+ return NULL; ++ return 0; ++ ++ for (i = 0; i < nr_groups; i++) { 
++ g = groups->entries + i; + -+ for (g = groups->entries; -+ g < groups->entries + nr_groups; -+ g++) { + if (BCH_GROUP_DELETED(g)) + continue; + + len = strnlen(g->label, sizeof(g->label)); + if (!len) { -+ err = "group with empty label"; -+ goto err; ++ pr_buf(err, "label %u empty", i); ++ return -EINVAL; + } + } + + sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL); + if (!sorted) -+ return "cannot allocate memory"; ++ return -ENOMEM; + + memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted)); + sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL); + -+ for (i = 0; i + 1 < nr_groups; i++) -+ if (!BCH_GROUP_DELETED(sorted + i) && -+ !group_cmp(sorted + i, sorted + i + 1)) { -+ err = "duplicate groups"; ++ for (g = sorted; g + 1 < sorted + nr_groups; g++) ++ if (!BCH_GROUP_DELETED(g) && ++ !group_cmp(&g[0], &g[1])) { ++ pr_buf(err, "duplicate label %llu.%.*s", ++ BCH_GROUP_PARENT(g), ++ (int) sizeof(g->label), g->label); + goto err; + } + -+ err = NULL; ++ ret = 0; +err: + kfree(sorted); -+ return err; ++ return 0; +} + +static void bch2_sb_disk_groups_to_text(struct printbuf *out, @@ -33500,12 +35431,10 @@ index 000000000000..c52b6faac9b4 + return v; +} + -+void bch2_disk_path_to_text(struct printbuf *out, -+ struct bch_sb_handle *sb, -+ unsigned v) ++void bch2_disk_path_to_text(struct printbuf *out, struct bch_sb *sb, unsigned v) +{ + struct bch_sb_field_disk_groups *groups = -+ bch2_sb_get_disk_groups(sb->sb); ++ bch2_sb_get_disk_groups(sb); + struct bch_disk_group *g; + unsigned nr = 0; + u16 path[32]; @@ -33534,15 +35463,13 @@ index 000000000000..c52b6faac9b4 + v = path[--nr]; + g = groups->entries + v; + -+ bch_scnmemcpy(out, g->label, -+ strnlen(g->label, sizeof(g->label))); -+ ++ pr_buf(out, "%.*s", (int) sizeof(g->label), g->label); + if (nr) + pr_buf(out, "."); + } + return; +inval: -+ pr_buf(out, "invalid group %u", v); ++ pr_buf(out, "invalid label %u", v); +} + +int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, 
const char *name) @@ -33606,7 +35533,10 @@ index 000000000000..c52b6faac9b4 + return -EINVAL; +} + -+void bch2_opt_target_to_text(struct printbuf *out, struct bch_fs *c, u64 v) ++void bch2_opt_target_to_text(struct printbuf *out, ++ struct bch_fs *c, ++ struct bch_sb *sb, ++ u64 v) +{ + struct target t = target_decode(v); + @@ -33614,33 +35544,49 @@ index 000000000000..c52b6faac9b4 + case TARGET_NULL: + pr_buf(out, "none"); + break; -+ case TARGET_DEV: { -+ struct bch_dev *ca; ++ case TARGET_DEV: ++ if (c) { ++ struct bch_dev *ca; + -+ rcu_read_lock(); -+ ca = t.dev < c->sb.nr_devices -+ ? rcu_dereference(c->devs[t.dev]) -+ : NULL; ++ rcu_read_lock(); ++ ca = t.dev < c->sb.nr_devices ++ ? rcu_dereference(c->devs[t.dev]) ++ : NULL; + -+ if (ca && percpu_ref_tryget(&ca->io_ref)) { -+ char b[BDEVNAME_SIZE]; ++ if (ca && percpu_ref_tryget(&ca->io_ref)) { ++ char b[BDEVNAME_SIZE]; + -+ pr_buf(out, "/dev/%s", -+ bdevname(ca->disk_sb.bdev, b)); -+ percpu_ref_put(&ca->io_ref); -+ } else if (ca) { -+ pr_buf(out, "offline device %u", t.dev); ++ pr_buf(out, "/dev/%s", ++ bdevname(ca->disk_sb.bdev, b)); ++ percpu_ref_put(&ca->io_ref); ++ } else if (ca) { ++ pr_buf(out, "offline device %u", t.dev); ++ } else { ++ pr_buf(out, "invalid device %u", t.dev); ++ } ++ ++ rcu_read_unlock(); + } else { -+ pr_buf(out, "invalid device %u", t.dev); -+ } ++ struct bch_sb_field_members *mi = bch2_sb_get_members(sb); ++ struct bch_member *m = mi->members + t.dev; + -+ rcu_read_unlock(); ++ if (bch2_dev_exists(sb, mi, t.dev)) { ++ pr_buf(out, "Device "); ++ pr_uuid(out, m->uuid.b); ++ pr_buf(out, " (%u)", t.dev); ++ } else { ++ pr_buf(out, "Bad device %u", t.dev); ++ } ++ } + break; -+ } + case TARGET_GROUP: -+ mutex_lock(&c->sb_lock); -+ bch2_disk_path_to_text(out, &c->disk_sb, t.group); -+ mutex_unlock(&c->sb_lock); ++ if (c) { ++ mutex_lock(&c->sb_lock); ++ bch2_disk_path_to_text(out, c->disk_sb.sb, t.group); ++ mutex_unlock(&c->sb_lock); ++ } else { ++ bch2_disk_path_to_text(out, sb, 
t.group); ++ } + break; + default: + BUG(); @@ -33648,10 +35594,10 @@ index 000000000000..c52b6faac9b4 +} diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h new file mode 100644 -index 000000000000..3d84f23c34ed +index 000000000000..de915480514b --- /dev/null +++ b/fs/bcachefs/disk_groups.h -@@ -0,0 +1,91 @@ +@@ -0,0 +1,90 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DISK_GROUPS_H +#define _BCACHEFS_DISK_GROUPS_H @@ -33729,11 +35675,10 @@ index 000000000000..3d84f23c34ed +/* Exported for userspace bcachefs-tools: */ +int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *); + -+void bch2_disk_path_to_text(struct printbuf *, struct bch_sb_handle *, -+ unsigned); ++void bch2_disk_path_to_text(struct printbuf *, struct bch_sb *, unsigned); + +int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *); -+void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, u64); ++void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); + +int bch2_sb_disk_groups_to_cpu(struct bch_fs *); + @@ -33745,10 +35690,10 @@ index 000000000000..3d84f23c34ed +#endif /* _BCACHEFS_DISK_GROUPS_H */ diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c new file mode 100644 -index 000000000000..bca1b8a7b673 +index 000000000000..616a551265e0 --- /dev/null +++ b/fs/bcachefs/ec.c -@@ -0,0 +1,1780 @@ +@@ -0,0 +1,1682 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* erasure coding */ @@ -33766,6 +35711,7 @@ index 000000000000..bca1b8a7b673 +#include "io.h" +#include "keylist.h" +#include "recovery.h" ++#include "replicas.h" +#include "super-io.h" +#include "util.h" + @@ -33893,8 +35839,8 @@ index 000000000000..bca1b8a7b673 +} + +/* returns blocknr in stripe that we matched: */ -+static int bkey_matches_stripe(struct bch_stripe *s, -+ struct bkey_s_c k) ++static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s, ++ struct bkey_s_c k, unsigned *block) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 
+ const struct bch_extent_ptr *ptr; @@ -33903,10 +35849,12 @@ index 000000000000..bca1b8a7b673 + bkey_for_each_ptr(ptrs, ptr) + for (i = 0; i < nr_data; i++) + if (__bch2_ptr_matches_stripe(&s->ptrs[i], ptr, -+ le16_to_cpu(s->sectors))) -+ return i; ++ le16_to_cpu(s->sectors))) { ++ *block = i; ++ return ptr; ++ } + -+ return -1; ++ return NULL; +} + +static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx) @@ -34034,14 +35982,15 @@ index 000000000000..bca1b8a7b673 + struct bch_csum got = ec_block_checksum(buf, i, offset); + + if (bch2_crc_cmp(want, got)) { -+ char buf2[200]; ++ struct printbuf buf2 = PRINTBUF; + -+ bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&buf->key.k_i)); ++ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&buf->key.k_i)); + + bch_err_ratelimited(c, + "stripe checksum error for %ps at %u:%u: csum type %u, expected %llx got %llx\n%s", + (void *) _RET_IP_, i, j, v->csum_type, -+ want.lo, got.lo, buf2); ++ want.lo, got.lo, buf2.buf); ++ printbuf_exit(&buf2); + clear_bit(i, buf->valid); + break; + } @@ -34295,11 +36244,11 @@ index 000000000000..bca1b8a7b673 + free_heap(&n); + } + -+ if (!genradix_ptr_alloc(&c->stripes[0], idx, gfp)) ++ if (!genradix_ptr_alloc(&c->stripes, idx, gfp)) + return -ENOMEM; + + if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING && -+ !genradix_ptr_alloc(&c->stripes[1], idx, gfp)) ++ !genradix_ptr_alloc(&c->gc_stripes, idx, gfp)) + return -ENOMEM; + + return 0; @@ -34344,13 +36293,13 @@ index 000000000000..bca1b8a7b673 +{ + struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap); + -+ genradix_ptr(&c->stripes[0], h->data[i].idx)->heap_idx = i; ++ genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i; +} + +static void heap_verify_backpointer(struct bch_fs *c, size_t idx) +{ + ec_stripes_heap *h = &c->ec_stripes_heap; -+ struct stripe *m = genradix_ptr(&c->stripes[0], idx); ++ struct stripe *m = genradix_ptr(&c->stripes, idx); + + BUG_ON(!m->alive); + BUG_ON(m->heap_idx >= h->used); @@ -34425,7 +36374,7 @@ 
index 000000000000..bca1b8a7b673 + return bch2_btree_delete_range(c, BTREE_ID_stripes, + POS(0, idx), + POS(0, idx + 1), -+ NULL); ++ 0, NULL); +} + +static void ec_stripe_delete_work(struct work_struct *work) @@ -34442,7 +36391,7 @@ index 000000000000..bca1b8a7b673 + break; + } + -+ bch2_stripes_heap_del(c, genradix_ptr(&c->stripes[0], idx), idx); ++ bch2_stripes_heap_del(c, genradix_ptr(&c->stripes, idx), idx); + spin_unlock(&c->ec_stripes_heap_lock); + + if (ec_stripe_delete(c, idx)) @@ -34452,22 +36401,18 @@ index 000000000000..bca1b8a7b673 + +/* stripe creation: */ + -+static int ec_stripe_bkey_insert(struct bch_fs *c, ++static int ec_stripe_bkey_insert(struct btree_trans *trans, + struct bkey_i_stripe *stripe, + struct disk_reservation *res) +{ -+ struct btree_trans trans; ++ struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + struct bpos min_pos = POS(0, 1); + struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint)); + int ret; + -+ bch2_trans_init(&trans, c, 0, 0); -+retry: -+ bch2_trans_begin(&trans); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_stripes, start_pos, ++ for_each_btree_key(trans, iter, BTREE_ID_stripes, start_pos, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { + if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) { + if (start_pos.offset) { @@ -34488,29 +36433,24 @@ index 000000000000..bca1b8a7b673 +found_slot: + start_pos = iter.pos; + -+ ret = ec_stripe_mem_alloc(&trans, &iter); ++ ret = ec_stripe_mem_alloc(trans, &iter); + if (ret) + goto err; + + stripe->k.p = iter.pos; + -+ ret = bch2_trans_update(&trans, &iter, &stripe->k_i, 0) ?: -+ bch2_trans_commit(&trans, res, NULL, -+ BTREE_INSERT_NOFAIL); ++ ret = bch2_trans_update(trans, &iter, &stripe->k_i, 0); ++ ++ c->ec_stripe_hint = start_pos.offset; +err: -+ bch2_trans_iter_exit(&trans, &iter); -+ -+ if (ret == -EINTR) -+ goto retry; -+ -+ c->ec_stripe_hint = ret ? 
start_pos.offset : start_pos.offset + 1; -+ bch2_trans_exit(&trans); ++ bch2_trans_iter_exit(trans, &iter); + + return ret; +} + +static int ec_stripe_bkey_update(struct btree_trans *trans, -+ struct bkey_i_stripe *new) ++ struct bkey_i_stripe *new, ++ struct disk_reservation *res) +{ + struct btree_iter iter; + struct bkey_s_c k; @@ -34593,6 +36533,7 @@ index 000000000000..bca1b8a7b673 + (k = bch2_btree_iter_peek(&iter)).k && + !(ret = bkey_err(k)) && + bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) { ++ const struct bch_extent_ptr *ptr_c; + struct bch_extent_ptr *ptr, *ec_ptr = NULL; + + if (extent_has_stripe_ptr(k, s->key.k.p.offset)) { @@ -34600,8 +36541,12 @@ index 000000000000..bca1b8a7b673 + continue; + } + -+ block = bkey_matches_stripe(&s->key.v, k); -+ if (block < 0) { ++ ptr_c = bkey_matches_stripe(&s->key.v, k, &block); ++ /* ++ * It doesn't generally make sense to erasure code cached ptrs: ++ * XXX: should we be incrementing a counter? ++ */ ++ if (!ptr_c || ptr_c->cached) { + bch2_btree_iter_advance(&iter); + continue; + } @@ -34697,10 +36642,10 @@ index 000000000000..bca1b8a7b673 + goto err_put_writes; + } + -+ ret = s->have_existing_stripe -+ ? bch2_trans_do(c, &s->res, NULL, BTREE_INSERT_NOFAIL, -+ ec_stripe_bkey_update(&trans, &s->new_stripe.key)) -+ : ec_stripe_bkey_insert(c, &s->new_stripe.key, &s->res); ++ ret = bch2_trans_do(c, &s->res, NULL, BTREE_INSERT_NOFAIL, ++ s->have_existing_stripe ++ ? 
ec_stripe_bkey_update(&trans, &s->new_stripe.key, &s->res) ++ : ec_stripe_bkey_insert(&trans, &s->new_stripe.key, &s->res)); + if (ret) { + bch_err(c, "error creating stripe: error creating stripe key"); + goto err_put_writes; @@ -34715,7 +36660,7 @@ index 000000000000..bca1b8a7b673 + } + + spin_lock(&c->ec_stripes_heap_lock); -+ m = genradix_ptr(&c->stripes[0], s->new_stripe.key.k.p.offset); ++ m = genradix_ptr(&c->stripes, s->new_stripe.key.k.p.offset); + + BUG_ON(m->on_heap); + bch2_stripes_heap_insert(c, m, s->new_stripe.key.k.p.offset); @@ -34815,7 +36760,7 @@ index 000000000000..bca1b8a7b673 + if (!ob) + return NULL; + -+ ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ ca = bch_dev_bkey_exists(c, ob->dev); + offset = ca->mi.bucket_size - ob->sectors_free; + + return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9); @@ -34904,7 +36849,7 @@ index 000000000000..bca1b8a7b673 + s->v.algorithm = 0; + s->v.nr_blocks = nr_data + nr_parity; + s->v.nr_redundant = nr_parity; -+ s->v.csum_granularity_bits = ilog2(c->sb.encoded_extent_max); ++ s->v.csum_granularity_bits = ilog2(c->opts.encoded_extent_max >> 9); + s->v.csum_type = BCH_CSUM_crc32c; + s->v.pad = 0; + @@ -35023,16 +36968,15 @@ index 000000000000..bca1b8a7b673 + return h; +} + -+static enum bucket_alloc_ret -+new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, -+ struct closure *cl) ++static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, ++ struct closure *cl) +{ + struct bch_devs_mask devs = h->devs; + struct open_bucket *ob; + struct open_buckets buckets; + unsigned i, j, nr_have_parity = 0, nr_have_data = 0; + bool have_cache = true; -+ enum bucket_alloc_ret ret = ALLOC_SUCCESS; ++ int ret = 0; + + for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) { + if (test_bit(i, h->s->blocks_gotten)) { @@ -35047,9 +36991,6 @@ index 000000000000..bca1b8a7b673 + BUG_ON(nr_have_data > h->s->nr_data); + BUG_ON(nr_have_parity > h->s->nr_parity); + -+ 
percpu_down_read(&c->mark_lock); -+ rcu_read_lock(); -+ + buckets.nr = 0; + if (nr_have_parity < h->s->nr_parity) { + ret = bch2_bucket_alloc_set(c, &buckets, @@ -35059,8 +37000,8 @@ index 000000000000..bca1b8a7b673 + &nr_have_parity, + &have_cache, + h->copygc -+ ? RESERVE_MOVINGGC -+ : RESERVE_NONE, ++ ? RESERVE_movinggc ++ : RESERVE_none, + 0, + cl); + @@ -35071,12 +37012,12 @@ index 000000000000..bca1b8a7b673 + BUG_ON(j >= h->s->nr_data + h->s->nr_parity); + + h->s->blocks[j] = buckets.v[i]; -+ h->s->new_stripe.key.v.ptrs[j] = ob->ptr; ++ h->s->new_stripe.key.v.ptrs[j] = bch2_ob_ptr(c, ob); + __set_bit(j, h->s->blocks_gotten); + } + + if (ret) -+ goto err; ++ return ret; + } + + buckets.nr = 0; @@ -35088,8 +37029,8 @@ index 000000000000..bca1b8a7b673 + &nr_have_data, + &have_cache, + h->copygc -+ ? RESERVE_MOVINGGC -+ : RESERVE_NONE, ++ ? RESERVE_movinggc ++ : RESERVE_none, + 0, + cl); + @@ -35099,17 +37040,15 @@ index 000000000000..bca1b8a7b673 + BUG_ON(j >= h->s->nr_data); + + h->s->blocks[j] = buckets.v[i]; -+ h->s->new_stripe.key.v.ptrs[j] = ob->ptr; ++ h->s->new_stripe.key.v.ptrs[j] = bch2_ob_ptr(c, ob); + __set_bit(j, h->s->blocks_gotten); + } + + if (ret) -+ goto err; ++ return ret; + } -+err: -+ rcu_read_unlock(); -+ percpu_up_read(&c->mark_lock); -+ return ret; ++ ++ return 0; +} + +/* XXX: doesn't obey target: */ @@ -35132,7 +37071,7 @@ index 000000000000..bca1b8a7b673 + continue; + + stripe_idx = h->data[heap_idx].idx; -+ m = genradix_ptr(&c->stripes[0], stripe_idx); ++ m = genradix_ptr(&c->stripes, stripe_idx); + + if (m->algorithm == head->algo && + m->nr_redundant == head->redundancy && @@ -35267,7 +37206,7 @@ index 000000000000..bca1b8a7b673 + +err: + bch2_ec_stripe_head_put(c, h); -+ return ERR_PTR(-ret); ++ return ERR_PTR(ret); +} + +void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) @@ -35288,7 +37227,7 @@ index 000000000000..bca1b8a7b673 + continue; + + ob = c->open_buckets + h->s->blocks[i]; -+ if (ob->ptr.dev == ca->dev_idx) ++ if 
(ob->dev == ca->dev_idx) + goto found; + } + goto unlock; @@ -35306,151 +37245,59 @@ index 000000000000..bca1b8a7b673 + struct genradix_iter iter; + struct stripe *m; + -+ genradix_for_each(&c->stripes[0], iter, m) ++ genradix_for_each(&c->stripes, iter, m) + if (m->alive) + bch2_stripes_heap_insert(c, m, iter.pos); +} + -+static int __bch2_stripe_write_key(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct stripe *m, -+ size_t idx, -+ struct bkey_i_stripe *new_key) -+{ -+ const struct bch_stripe *v; -+ struct bkey_s_c k; -+ unsigned i; -+ int ret; -+ -+ bch2_btree_iter_set_pos(iter, POS(0, idx)); -+ -+ k = bch2_btree_iter_peek_slot(iter); -+ ret = bkey_err(k); -+ if (ret) -+ return ret; -+ -+ if (k.k->type != KEY_TYPE_stripe) -+ return -EIO; -+ -+ v = bkey_s_c_to_stripe(k).v; -+ for (i = 0; i < v->nr_blocks; i++) -+ if (m->block_sectors[i] != stripe_blockcount_get(v, i)) -+ goto write; -+ return 0; -+write: -+ bkey_reassemble(&new_key->k_i, k); -+ -+ for (i = 0; i < new_key->v.nr_blocks; i++) -+ stripe_blockcount_set(&new_key->v, i, -+ m->block_sectors[i]); -+ -+ return bch2_trans_update(trans, iter, &new_key->k_i, 0); -+} -+ -+int bch2_stripes_write(struct bch_fs *c, unsigned flags) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct genradix_iter giter; -+ struct bkey_i_stripe *new_key; -+ struct stripe *m; -+ int ret = 0; -+ -+ new_key = kmalloc(255 * sizeof(u64), GFP_KERNEL); -+ BUG_ON(!new_key); -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_stripes, POS_MIN, -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); -+ -+ genradix_for_each(&c->stripes[0], giter, m) { -+ if (!m->alive) -+ continue; -+ -+ ret = __bch2_trans_do(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL|flags, -+ __bch2_stripe_write_key(&trans, &iter, m, -+ giter.pos, new_key)); -+ -+ if (ret) -+ break; -+ } -+ bch2_trans_iter_exit(&trans, &iter); -+ -+ bch2_trans_exit(&trans); -+ -+ kfree(new_key); -+ -+ return ret; -+} -+ -+static int 
bch2_stripes_read_fn(struct btree_trans *trans, struct bkey_s_c k) -+{ -+ struct bch_fs *c = trans->c; -+ int ret = 0; -+ -+ if (k.k->type == KEY_TYPE_stripe) -+ ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?: -+ bch2_mark_key(trans, k, -+ BTREE_TRIGGER_NOATOMIC); -+ -+ return ret; -+} -+ +int bch2_stripes_read(struct bch_fs *c) +{ + struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ const struct bch_stripe *s; ++ struct stripe *m; ++ unsigned i; + int ret; + + bch2_trans_init(&trans, c, 0, 0); -+ ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_stripes, -+ bch2_stripes_read_fn); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ if (k.k->type != KEY_TYPE_stripe) ++ continue; ++ ++ ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL); ++ if (ret) ++ break; ++ ++ s = bkey_s_c_to_stripe(k).v; ++ ++ m = genradix_ptr(&c->stripes, k.k->p.offset); ++ m->alive = true; ++ m->sectors = le16_to_cpu(s->sectors); ++ m->algorithm = s->algorithm; ++ m->nr_blocks = s->nr_blocks; ++ m->nr_redundant = s->nr_redundant; ++ m->blocks_nonempty = 0; ++ ++ for (i = 0; i < s->nr_blocks; i++) ++ m->blocks_nonempty += !!stripe_blockcount_get(s, i); ++ ++ spin_lock(&c->ec_stripes_heap_lock); ++ bch2_stripes_heap_update(c, m, k.k->p.offset); ++ spin_unlock(&c->ec_stripes_heap_lock); ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ + bch2_trans_exit(&trans); ++ + if (ret) + bch_err(c, "error reading stripes: %i", ret); + + return ret; +} + -+int bch2_ec_mem_alloc(struct bch_fs *c, bool gc) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ size_t i, idx = 0; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_stripes, POS(0, U64_MAX), 0); -+ -+ k = bch2_btree_iter_prev(&iter); -+ ret = bkey_err(k); -+ if (!ret && k.k) -+ idx = k.k->p.offset + 1; -+ -+ bch2_trans_iter_exit(&trans, &iter); -+ 
bch2_trans_exit(&trans); -+ if (ret) -+ return ret; -+ -+ if (!idx) -+ return 0; -+ -+ if (!gc && -+ !init_heap(&c->ec_stripes_heap, roundup_pow_of_two(idx), -+ GFP_KERNEL)) -+ return -ENOMEM; -+#if 0 -+ ret = genradix_prealloc(&c->stripes[gc], idx, GFP_KERNEL); -+#else -+ for (i = 0; i < idx; i++) -+ if (!genradix_ptr_alloc(&c->stripes[gc], i, GFP_KERNEL)) -+ return -ENOMEM; -+#endif -+ return 0; -+} -+ +void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) +{ + ec_stripes_heap *h = &c->ec_stripes_heap; @@ -35459,7 +37306,7 @@ index 000000000000..bca1b8a7b673 + + spin_lock(&c->ec_stripes_heap_lock); + for (i = 0; i < min_t(size_t, h->used, 20); i++) { -+ m = genradix_ptr(&c->stripes[0], h->data[i].idx); ++ m = genradix_ptr(&c->stripes, h->data[i].idx); + + pr_buf(out, "%zu %u/%u+%u\n", h->data[i].idx, + h->data[i].blocks_nonempty, @@ -35517,7 +37364,7 @@ index 000000000000..bca1b8a7b673 + BUG_ON(!list_empty(&c->ec_stripe_new_list)); + + free_heap(&c->ec_stripes_heap); -+ genradix_free(&c->stripes[0]); ++ genradix_free(&c->stripes); + bioset_exit(&c->ec_bioset); +} + @@ -35531,10 +37378,10 @@ index 000000000000..bca1b8a7b673 +} diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h new file mode 100644 -index 000000000000..eb16e140e2c8 +index 000000000000..9d508a2f3bbc --- /dev/null +++ b/fs/bcachefs/ec.h -@@ -0,0 +1,229 @@ +@@ -0,0 +1,228 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_EC_H +#define _BCACHEFS_EC_H @@ -35551,6 +37398,8 @@ index 000000000000..eb16e140e2c8 + .key_invalid = bch2_stripe_invalid, \ + .val_to_text = bch2_stripe_to_text, \ + .swab = bch2_ptr_swab, \ ++ .trans_trigger = bch2_trans_mark_stripe, \ ++ .atomic_trigger = bch2_mark_stripe, \ +} + +static inline unsigned stripe_csums_per_device(const struct bch_stripe *s) @@ -35645,7 +37494,7 @@ index 000000000000..eb16e140e2c8 + le16_to_cpu(s->sectors)); +} + -+static inline bool bch2_ptr_matches_stripe_m(const struct stripe *m, ++static inline bool 
bch2_ptr_matches_stripe_m(const struct gc_stripe *m, + struct extent_ptr_decoded p) +{ + unsigned nr_data = m->nr_blocks - m->nr_redundant; @@ -35753,9 +37602,6 @@ index 000000000000..eb16e140e2c8 +void bch2_stripes_heap_start(struct bch_fs *); + +int bch2_stripes_read(struct bch_fs *); -+int bch2_stripes_write(struct bch_fs *, unsigned); -+ -+int bch2_ec_mem_alloc(struct bch_fs *, bool); + +void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *); +void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *); @@ -35766,10 +37612,10 @@ index 000000000000..eb16e140e2c8 +#endif /* _BCACHEFS_EC_H */ diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h new file mode 100644 -index 000000000000..3fc31222459a +index 000000000000..edd93da663c1 --- /dev/null +++ b/fs/bcachefs/ec_types.h -@@ -0,0 +1,37 @@ +@@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_EC_TYPES_H +#define _BCACHEFS_EC_TYPES_H @@ -35793,6 +37639,15 @@ index 000000000000..3fc31222459a + unsigned alive:1; /* does a corresponding key exist in stripes btree? */ + unsigned on_heap:1; + u8 blocks_nonempty; ++}; ++ ++struct gc_stripe { ++ u16 sectors; ++ ++ u8 nr_blocks; ++ u8 nr_redundant; ++ ++ unsigned alive:1; /* does a corresponding key exist in stripes btree? 
*/ + u16 block_sectors[BCH_BKEY_PTRS_MAX]; + struct bch_extent_ptr ptrs[BCH_BKEY_PTRS_MAX]; + @@ -35807,9 +37662,27 @@ index 000000000000..3fc31222459a +typedef HEAP(struct ec_stripe_heap_entry) ec_stripes_heap; + +#endif /* _BCACHEFS_EC_TYPES_H */ +diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h +new file mode 100644 +index 000000000000..f7d12915c1cc +--- /dev/null ++++ b/fs/bcachefs/errcode.h +@@ -0,0 +1,12 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_ERRCODE_H ++#define _BCACHEFS_ERRCODE_H ++ ++enum { ++ /* Bucket allocator: */ ++ OPEN_BUCKETS_EMPTY = 2048, ++ FREELIST_EMPTY, /* Allocator thread not keeping up */ ++ INSUFFICIENT_DEVICES, ++}; ++ ++#endif /* _BCACHFES_ERRCODE_H */ diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c new file mode 100644 -index 000000000000..2cea694575e9 +index 000000000000..8279a9ba76a5 --- /dev/null +++ b/fs/bcachefs/error.c @@ -0,0 +1,185 @@ @@ -35830,7 +37703,7 @@ index 000000000000..2cea694575e9 + return false; + case BCH_ON_ERROR_ro: + if (bch2_fs_emergency_read_only(c)) -+ bch_err(c, "emergency read only"); ++ bch_err(c, "inconsistency detected - emergency read only"); + return true; + case BCH_ON_ERROR_panic: + panic(bch2_fmt(c, "panic after error")); @@ -35850,7 +37723,7 @@ index 000000000000..2cea694575e9 +void bch2_fatal_error(struct bch_fs *c) +{ + if (bch2_fs_emergency_read_only(c)) -+ bch_err(c, "emergency read only"); ++ bch_err(c, "fatal error - emergency read only"); +} + +void bch2_io_error_work(struct work_struct *work) @@ -36000,10 +37873,10 @@ index 000000000000..2cea694575e9 +} diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h new file mode 100644 -index 000000000000..986938298adc +index 000000000000..6e63c38186f3 --- /dev/null +++ b/fs/bcachefs/error.h -@@ -0,0 +1,218 @@ +@@ -0,0 +1,238 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ERROR_H +#define _BCACHEFS_ERROR_H @@ -36045,7 +37918,7 @@ index 000000000000..986938298adc + +#define 
bch2_fs_inconsistent_on(cond, c, ...) \ +({ \ -+ int _ret = !!(cond); \ ++ bool _ret = unlikely(!!(cond)); \ + \ + if (_ret) \ + bch2_fs_inconsistent(c, __VA_ARGS__); \ @@ -36065,7 +37938,7 @@ index 000000000000..986938298adc + +#define bch2_dev_inconsistent_on(cond, ca, ...) \ +({ \ -+ int _ret = !!(cond); \ ++ bool _ret = unlikely(!!(cond)); \ + \ + if (_ret) \ + bch2_dev_inconsistent(ca, __VA_ARGS__); \ @@ -36073,6 +37946,26 @@ index 000000000000..986938298adc +}) + +/* ++ * When a transaction update discovers or is causing a fs inconsistency, it's ++ * helpful to also dump the pending updates: ++ */ ++#define bch2_trans_inconsistent(trans, ...) \ ++({ \ ++ bch_err(trans->c, __VA_ARGS__); \ ++ bch2_inconsistent_error(trans->c); \ ++ bch2_dump_trans_updates(trans); \ ++}) ++ ++#define bch2_trans_inconsistent_on(cond, trans, ...) \ ++({ \ ++ bool _ret = unlikely(!!(cond)); \ ++ \ ++ if (_ret) \ ++ bch2_trans_inconsistent(trans, __VA_ARGS__); \ ++ _ret; \ ++}) ++ ++/* + * Fsck errors: inconsistency errors we detect at mount time, and should ideally + * be able to repair: + */ @@ -36135,7 +38028,7 @@ index 000000000000..986938298adc +/* XXX: mark in superblock that filesystem contains errors, if we ignore: */ + +#define __fsck_err_on(cond, c, _flags, ...) \ -+ ((cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false) ++ (unlikely(cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false) + +#define need_fsck_err_on(cond, c, ...) \ + __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__) @@ -36170,7 +38063,7 @@ index 000000000000..986938298adc + +#define bch2_fs_fatal_err_on(cond, c, ...) 
\ +({ \ -+ int _ret = !!(cond); \ ++ bool _ret = unlikely(!!(cond)); \ + \ + if (_ret) \ + bch2_fs_fatal_error(c, __VA_ARGS__); \ @@ -36224,10 +38117,10 @@ index 000000000000..986938298adc +#endif /* _BCACHEFS_ERROR_H */ diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c new file mode 100644 -index 000000000000..58b2c96f450c +index 000000000000..2fd5d9672a44 --- /dev/null +++ b/fs/bcachefs/extent_update.c -@@ -0,0 +1,169 @@ +@@ -0,0 +1,178 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "btree_update.h" @@ -36245,17 +38138,26 @@ index 000000000000..58b2c96f450c +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; -+ unsigned ret = 0; ++ unsigned ret = 0, lru = 0; + + bkey_extent_entry_for_each(ptrs, entry) { + switch (__extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: ++ /* Might also be updating LRU btree */ ++ if (entry->ptr.cached) ++ lru++; ++ ++ fallthrough; + case BCH_EXTENT_ENTRY_stripe_ptr: + ret++; + } + } + -+ return ret; ++ /* ++ * Updating keys in the alloc btree may also update keys in the ++ * freespace or discard btrees: ++ */ ++ return lru + ret * 2; +} + +static int count_iters_for_insert(struct btree_trans *trans, @@ -36417,10 +38319,10 @@ index 000000000000..6f5cf449361a +#endif /* _BCACHEFS_EXTENT_UPDATE_H */ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c new file mode 100644 -index 000000000000..89b5be907eea +index 000000000000..77a0d49a2372 --- /dev/null +++ b/fs/bcachefs/extents.c -@@ -0,0 +1,1249 @@ +@@ -0,0 +1,1259 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2010 Kent Overstreet @@ -36726,7 +38628,7 @@ index 000000000000..89b5be907eea + + if (lp.crc.csum_type && + lp.crc.uncompressed_size + -+ rp.crc.uncompressed_size > c->sb.encoded_extent_max) ++ rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9)) + return false; + + if (lp.crc.uncompressed_size + rp.crc.uncompressed_size > @@ -37377,15 +39279,25 @@ index 
000000000000..89b5be907eea + switch (__extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + ptr = entry_to_ptr(entry); -+ ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] ++ ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] + ? bch_dev_bkey_exists(c, ptr->dev) + : NULL; + -+ pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev, -+ (u64) ptr->offset, ptr->gen, -+ ptr->cached ? " cached" : "", -+ ca && ptr_stale(ca, ptr) -+ ? " stale" : ""); ++ if (!ca) { ++ pr_buf(out, "ptr: %u:%llu gen %u%s", ptr->dev, ++ (u64) ptr->offset, ptr->gen, ++ ptr->cached ? " cached" : ""); ++ } else { ++ u32 offset; ++ u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); ++ ++ pr_buf(out, "ptr: %u:%llu:%u gen %u%s", ptr->dev, ++ b, offset, ptr->gen, ++ ptr->cached ? " cached" : ""); ++ ++ if (ca && ptr_stale(ca, ptr)) ++ pr_buf(out, " stale"); ++ } + break; + case BCH_EXTENT_ENTRY_crc32: + case BCH_EXTENT_ENTRY_crc64: @@ -37461,7 +39373,7 @@ index 000000000000..89b5be907eea + + if (k.k->type == KEY_TYPE_btree_ptr || + k.k->type == KEY_TYPE_btree_ptr_v2) -+ size_ondisk = c->opts.btree_node_size; ++ size_ondisk = btree_sectors(c); + + bkey_extent_entry_for_each(ptrs, entry) { + if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) @@ -37672,10 +39584,10 @@ index 000000000000..89b5be907eea +} diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h new file mode 100644 -index 000000000000..9c2567274a2b +index 000000000000..ae650849d98a --- /dev/null +++ b/fs/bcachefs/extents.h -@@ -0,0 +1,680 @@ +@@ -0,0 +1,688 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_EXTENTS_H +#define _BCACHEFS_EXTENTS_H @@ -38059,6 +39971,8 @@ index 000000000000..9c2567274a2b + .key_invalid = bch2_btree_ptr_invalid, \ + .val_to_text = bch2_btree_ptr_to_text, \ + .swab = bch2_ptr_swab, \ ++ .trans_trigger = bch2_trans_mark_extent, \ ++ .atomic_trigger = bch2_mark_extent, \ +} + +#define bch2_bkey_ops_btree_ptr_v2 (struct bkey_ops) { \ @@ -38066,6 +39980,8 @@ index 
000000000000..9c2567274a2b + .val_to_text = bch2_btree_ptr_v2_to_text, \ + .swab = bch2_ptr_swab, \ + .compat = bch2_btree_ptr_v2_compat, \ ++ .trans_trigger = bch2_trans_mark_extent, \ ++ .atomic_trigger = bch2_mark_extent, \ +} + +/* KEY_TYPE_extent: */ @@ -38080,6 +39996,8 @@ index 000000000000..9c2567274a2b + .swab = bch2_ptr_swab, \ + .key_normalize = bch2_extent_normalize, \ + .key_merge = bch2_extent_merge, \ ++ .trans_trigger = bch2_trans_mark_extent, \ ++ .atomic_trigger = bch2_mark_extent, \ +} + +/* KEY_TYPE_reservation: */ @@ -38092,6 +40010,8 @@ index 000000000000..9c2567274a2b + .key_invalid = bch2_reservation_invalid, \ + .val_to_text = bch2_reservation_to_text, \ + .key_merge = bch2_reservation_merge, \ ++ .trans_trigger = bch2_trans_mark_reservation, \ ++ .atomic_trigger = bch2_mark_reservation, \ +} + +/* Extent checksum entries: */ @@ -38404,10 +40324,10 @@ index 000000000000..43d6c341ecca +#endif /* _BCACHEFS_EXTENTS_TYPES_H */ diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h new file mode 100644 -index 000000000000..26d5cad7e6a5 +index 000000000000..05429c9631cd --- /dev/null +++ b/fs/bcachefs/eytzinger.h -@@ -0,0 +1,285 @@ +@@ -0,0 +1,281 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _EYTZINGER_H +#define _EYTZINGER_H @@ -38427,10 +40347,6 @@ index 000000000000..26d5cad7e6a5 + * + * With one based indexing each level of the tree starts at a power of two - + * good for cacheline alignment: -+ * -+ * Size parameter is treated as if we were using 0 based indexing, however: -+ * valid nodes, and inorder indices, are in the range [1..size) - that is, there -+ * are actually size - 1 elements + */ + +static inline unsigned eytzinger1_child(unsigned i, unsigned child) @@ -38452,12 +40368,12 @@ index 000000000000..26d5cad7e6a5 + +static inline unsigned eytzinger1_first(unsigned size) +{ -+ return rounddown_pow_of_two(size - 1); ++ return rounddown_pow_of_two(size); +} + +static inline unsigned eytzinger1_last(unsigned size) +{ -+ 
return rounddown_pow_of_two(size) - 1; ++ return rounddown_pow_of_two(size + 1) - 1; +} + +/* @@ -38472,13 +40388,13 @@ index 000000000000..26d5cad7e6a5 + +static inline unsigned eytzinger1_next(unsigned i, unsigned size) +{ -+ EBUG_ON(i >= size); ++ EBUG_ON(i > size); + -+ if (eytzinger1_right_child(i) < size) { ++ if (eytzinger1_right_child(i) <= size) { + i = eytzinger1_right_child(i); + -+ i <<= __fls(size) - __fls(i); -+ i >>= i >= size; ++ i <<= __fls(size + 1) - __fls(i); ++ i >>= i > size; + } else { + i >>= ffz(i) + 1; + } @@ -38488,14 +40404,14 @@ index 000000000000..26d5cad7e6a5 + +static inline unsigned eytzinger1_prev(unsigned i, unsigned size) +{ -+ EBUG_ON(i >= size); ++ EBUG_ON(i > size); + -+ if (eytzinger1_left_child(i) < size) { ++ if (eytzinger1_left_child(i) <= size) { + i = eytzinger1_left_child(i) + 1; + -+ i <<= __fls(size) - __fls(i); ++ i <<= __fls(size + 1) - __fls(i); + i -= 1; -+ i >>= i >= size; ++ i >>= i > size; + } else { + i >>= __ffs(i) + 1; + } @@ -38505,17 +40421,17 @@ index 000000000000..26d5cad7e6a5 + +static inline unsigned eytzinger1_extra(unsigned size) +{ -+ return (size - rounddown_pow_of_two(size - 1)) << 1; ++ return (size + 1 - rounddown_pow_of_two(size)) << 1; +} + +static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size, + unsigned extra) +{ + unsigned b = __fls(i); -+ unsigned shift = __fls(size - 1) - b; ++ unsigned shift = __fls(size) - b; + int s; + -+ EBUG_ON(!i || i >= size); ++ EBUG_ON(!i || i > size); + + i ^= 1U << b; + i <<= 1; @@ -38540,7 +40456,7 @@ index 000000000000..26d5cad7e6a5 + unsigned shift; + int s; + -+ EBUG_ON(!i || i >= size); ++ EBUG_ON(!i || i > size); + + /* + * sign bit trick: @@ -38554,7 +40470,7 @@ index 000000000000..26d5cad7e6a5 + shift = __ffs(i); + + i >>= shift + 1; -+ i |= 1U << (__fls(size - 1) - shift); ++ i |= 1U << (__fls(size) - shift); + + return i; +} @@ -38595,39 +40511,39 @@ index 000000000000..26d5cad7e6a5 + +static inline unsigned 
eytzinger0_first(unsigned size) +{ -+ return eytzinger1_first(size + 1) - 1; ++ return eytzinger1_first(size) - 1; +} + +static inline unsigned eytzinger0_last(unsigned size) +{ -+ return eytzinger1_last(size + 1) - 1; ++ return eytzinger1_last(size) - 1; +} + +static inline unsigned eytzinger0_next(unsigned i, unsigned size) +{ -+ return eytzinger1_next(i + 1, size + 1) - 1; ++ return eytzinger1_next(i + 1, size) - 1; +} + +static inline unsigned eytzinger0_prev(unsigned i, unsigned size) +{ -+ return eytzinger1_prev(i + 1, size + 1) - 1; ++ return eytzinger1_prev(i + 1, size) - 1; +} + +static inline unsigned eytzinger0_extra(unsigned size) +{ -+ return eytzinger1_extra(size + 1); ++ return eytzinger1_extra(size); +} + +static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size, + unsigned extra) +{ -+ return __eytzinger1_to_inorder(i + 1, size + 1, extra) - 1; ++ return __eytzinger1_to_inorder(i + 1, size, extra) - 1; +} + +static inline unsigned __inorder_to_eytzinger0(unsigned i, unsigned size, + unsigned extra) +{ -+ return __inorder_to_eytzinger1(i + 1, size + 1, extra) - 1; ++ return __inorder_to_eytzinger1(i + 1, size, extra) - 1; +} + +static inline unsigned eytzinger0_to_inorder(unsigned i, unsigned size) @@ -38828,10 +40744,10 @@ index 000000000000..cdb272708a4b +#endif /* _BCACHEFS_FIFO_H */ diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c new file mode 100644 -index 000000000000..5f3429e99115 +index 000000000000..d543480be111 --- /dev/null +++ b/fs/bcachefs/fs-common.c -@@ -0,0 +1,493 @@ +@@ -0,0 +1,494 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -39163,6 +41079,7 @@ index 000000000000..5f3429e99115 + bool ret = false; + + for (id = 0; id < Inode_opt_nr; id++) { ++ /* Skip attributes that were explicitly set on this inode */ + if (dst_u->bi_fields_set & (1 << id)) + continue; + @@ -39376,10 +41293,10 @@ index 000000000000..dde237859514 +#endif /* _BCACHEFS_FS_COMMON_H */ diff --git 
a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c new file mode 100644 -index 000000000000..25643c71ec91 +index 000000000000..4004fa5c1cc9 --- /dev/null +++ b/fs/bcachefs/fs-io.c -@@ -0,0 +1,3427 @@ +@@ -0,0 +1,3495 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + @@ -39799,6 +41716,110 @@ index 000000000000..25643c71ec91 + bv.bv_len >> 9, nr_ptrs, state); +} + ++static void mark_pagecache_unallocated(struct bch_inode_info *inode, ++ u64 start, u64 end) ++{ ++ pgoff_t index = start >> PAGE_SECTORS_SHIFT; ++ pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; ++ struct pagevec pvec; ++ ++ if (end <= start) ++ return; ++ ++ pagevec_init(&pvec); ++ ++ do { ++ unsigned nr_pages, i, j; ++ ++ nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping, ++ &index, end_index); ++ for (i = 0; i < nr_pages; i++) { ++ struct page *page = pvec.pages[i]; ++ u64 pg_start = page->index << PAGE_SECTORS_SHIFT; ++ u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT; ++ unsigned pg_offset = max(start, pg_start) - pg_start; ++ unsigned pg_len = min(end, pg_end) - pg_offset - pg_start; ++ struct bch_page_state *s; ++ ++ BUG_ON(end <= pg_start); ++ BUG_ON(pg_offset >= PAGE_SECTORS); ++ BUG_ON(pg_offset + pg_len > PAGE_SECTORS); ++ ++ lock_page(page); ++ s = bch2_page_state(page); ++ ++ if (s) { ++ spin_lock(&s->lock); ++ for (j = pg_offset; j < pg_offset + pg_len; j++) ++ s->s[j].nr_replicas = 0; ++ spin_unlock(&s->lock); ++ } ++ ++ unlock_page(page); ++ } ++ pagevec_release(&pvec); ++ } while (index <= end_index); ++} ++ ++static void mark_pagecache_reserved(struct bch_inode_info *inode, ++ u64 start, u64 end) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ pgoff_t index = start >> PAGE_SECTORS_SHIFT; ++ pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; ++ struct pagevec pvec; ++ s64 i_sectors_delta = 0; ++ ++ if (end <= start) ++ return; ++ ++ pagevec_init(&pvec); ++ ++ do { ++ unsigned nr_pages, i, j; ++ ++ nr_pages = pagevec_lookup_range(&pvec, 
inode->v.i_mapping, ++ &index, end_index); ++ for (i = 0; i < nr_pages; i++) { ++ struct page *page = pvec.pages[i]; ++ u64 pg_start = page->index << PAGE_SECTORS_SHIFT; ++ u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT; ++ unsigned pg_offset = max(start, pg_start) - pg_start; ++ unsigned pg_len = min(end, pg_end) - pg_offset - pg_start; ++ struct bch_page_state *s; ++ ++ BUG_ON(end <= pg_start); ++ BUG_ON(pg_offset >= PAGE_SECTORS); ++ BUG_ON(pg_offset + pg_len > PAGE_SECTORS); ++ ++ lock_page(page); ++ s = bch2_page_state(page); ++ ++ if (s) { ++ spin_lock(&s->lock); ++ for (j = pg_offset; j < pg_offset + pg_len; j++) ++ switch (s->s[j].state) { ++ case SECTOR_UNALLOCATED: ++ s->s[j].state = SECTOR_RESERVED; ++ break; ++ case SECTOR_DIRTY: ++ s->s[j].state = SECTOR_DIRTY_RESERVED; ++ i_sectors_delta--; ++ break; ++ default: ++ break; ++ } ++ spin_unlock(&s->lock); ++ } ++ ++ unlock_page(page); ++ } ++ pagevec_release(&pvec); ++ } while (index <= end_index); ++ ++ i_sectors_acct(c, inode, NULL, i_sectors_delta); ++} ++ +static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode) +{ + /* XXX: this should not be open coded */ @@ -39954,8 +41975,7 @@ index 000000000000..25643c71ec91 + + bch2_disk_reservation_put(c, &disk_res); + -+ if (dirty_sectors) -+ i_sectors_acct(c, inode, NULL, dirty_sectors); ++ i_sectors_acct(c, inode, NULL, dirty_sectors); + + bch2_page_state_release(page); +} @@ -40003,8 +42023,7 @@ index 000000000000..25643c71ec91 + + spin_unlock(&s->lock); + -+ if (dirty_sectors) -+ i_sectors_acct(c, inode, &res->quota, dirty_sectors); ++ i_sectors_acct(c, inode, &res->quota, dirty_sectors); + + if (!PageDirty(page)) + __set_page_dirty_nobuffers(page); @@ -40304,7 +42323,7 @@ index 000000000000..25643c71ec91 + + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, + SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot), -+ BTREE_ITER_SLOTS|BTREE_ITER_FILTER_SNAPSHOTS); ++ BTREE_ITER_SLOTS); + while (1) { + struct 
bkey_s_c k; + unsigned bytes, sectors, offset_into_extent; @@ -40342,8 +42361,6 @@ index 000000000000..25643c71ec91 + + sectors = min(sectors, k.k->size - offset_into_extent); + -+ bch2_trans_unlock(trans); -+ + if (readpages_iter) + readpage_bio_extend(readpages_iter, &rbio->bio, sectors, + extent_partial_reads_expensive(k)); @@ -40560,7 +42577,7 @@ index 000000000000..25643c71ec91 + * racing with fallocate can cause us to add fewer sectors than + * expected - but we shouldn't add more sectors than expected: + */ -+ BUG_ON(io->op.i_sectors_delta > 0); ++ WARN_ON_ONCE(io->op.i_sectors_delta > 0); + + /* + * (error (due to going RO) halfway through a page can screw that up @@ -40746,8 +42763,8 @@ index 000000000000..25643c71ec91 + sectors << 9, offset << 9)); + + /* Check for writing past i_size: */ -+ WARN_ON((bio_end_sector(&w->io->op.wbio.bio) << 9) > -+ round_up(i_size, block_bytes(c))); ++ WARN_ON_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) > ++ round_up(i_size, block_bytes(c))); + + w->io->op.res.sectors += reserved_sectors; + w->io->op.i_sectors_delta -= dirty_sectors; @@ -41201,7 +43218,7 @@ index 000000000000..25643c71ec91 + iter->count -= shorten; + + bio = bio_alloc_bioset(GFP_KERNEL, -+ iov_iter_npages(iter, BIO_MAX_VECS), ++ bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), + &c->dio_read_bioset); + + bio->bi_end_io = bch2_direct_IO_read_endio; @@ -41236,7 +43253,7 @@ index 000000000000..25643c71ec91 + goto start; + while (iter->count) { + bio = bio_alloc_bioset(GFP_KERNEL, -+ iov_iter_npages(iter, BIO_MAX_VECS), ++ bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), + &c->bio_read); + bio->bi_end_io = bch2_direct_IO_read_split_endio; +start: @@ -41383,7 +43400,7 @@ index 000000000000..25643c71ec91 + while (1) { + iter_count = dio->iter.count; + -+ if (kthread) ++ if (kthread && dio->mm) + kthread_use_mm(dio->mm); + BUG_ON(current->faults_disabled_mapping); + current->faults_disabled_mapping = mapping; @@ -41393,7 +43410,7 @@ index 000000000000..25643c71ec91 + 
dropped_locks = fdm_dropped_locks(); + + current->faults_disabled_mapping = NULL; -+ if (kthread) ++ if (kthread && dio->mm) + kthread_unuse_mm(dio->mm); + + /* @@ -41586,9 +43603,7 @@ index 000000000000..25643c71ec91 + } + + bio = bio_alloc_bioset(GFP_KERNEL, -+ iov_iter_is_bvec(iter) -+ ? 0 -+ : iov_iter_npages(iter, BIO_MAX_VECS), ++ bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), + &c->dio_write_bioset); + dio = container_of(bio, struct dio_write, op.wbio.bio); + init_completion(&dio->done); @@ -41985,6 +44000,9 @@ index 000000000000..25643c71ec91 + U64_MAX, &i_sectors_delta); + i_sectors_acct(c, inode, NULL, i_sectors_delta); + ++ WARN_ON(!inode->v.i_size && inode->v.i_blocks && ++ !bch2_journal_error(&c->journal)); ++ + if (unlikely(ret)) + goto err; + @@ -42325,6 +44343,9 @@ index 000000000000..25643c71ec91 + ret = 0; + } + ++ bch2_trans_unlock(&trans); /* lock ordering, before taking pagecache locks: */ ++ mark_pagecache_reserved(inode, start_sector, iter.pos.offset); ++ + if (ret == -ENOSPC && (mode & FALLOC_FL_ZERO_RANGE)) { + struct quota_res quota_res = { 0 }; + s64 i_sectors_delta = 0; @@ -42430,43 +44451,6 @@ index 000000000000..25643c71ec91 + return ret; +} + -+static void mark_range_unallocated(struct bch_inode_info *inode, -+ loff_t start, loff_t end) -+{ -+ pgoff_t index = start >> PAGE_SHIFT; -+ pgoff_t end_index = (end - 1) >> PAGE_SHIFT; -+ struct pagevec pvec; -+ -+ pagevec_init(&pvec); -+ -+ do { -+ unsigned nr_pages, i, j; -+ -+ nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping, -+ &index, end_index); -+ if (nr_pages == 0) -+ break; -+ -+ for (i = 0; i < nr_pages; i++) { -+ struct page *page = pvec.pages[i]; -+ struct bch_page_state *s; -+ -+ lock_page(page); -+ s = bch2_page_state(page); -+ -+ if (s) { -+ spin_lock(&s->lock); -+ for (j = 0; j < PAGE_SECTORS; j++) -+ s->s[j].nr_replicas = 0; -+ spin_unlock(&s->lock); -+ } -+ -+ unlock_page(page); -+ } -+ pagevec_release(&pvec); -+ } while (index <= end_index); -+} -+ +loff_t 
bch2_remap_file_range(struct file *file_src, loff_t pos_src, + struct file *file_dst, loff_t pos_dst, + loff_t len, unsigned remap_flags) @@ -42512,7 +44496,8 @@ index 000000000000..25643c71ec91 + if (ret) + goto err; + -+ mark_range_unallocated(src, pos_src, pos_src + aligned_len); ++ mark_pagecache_unallocated(src, pos_src >> 9, ++ (pos_src + aligned_len) >> 9); + + ret = bch2_remap_range(c, + inode_inum(dst), pos_dst >> 9, @@ -43488,10 +45473,10 @@ index 000000000000..f201980ef2c3 +#endif /* _BCACHEFS_FS_IOCTL_H */ diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c new file mode 100644 -index 000000000000..7eb33da9c253 +index 000000000000..d462c06899d6 --- /dev/null +++ b/fs/bcachefs/fs.c -@@ -0,0 +1,1935 @@ +@@ -0,0 +1,1940 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + @@ -43524,6 +45509,7 @@ index 000000000000..7eb33da9c253 +#include +#include +#include ++#include +#include +#include +#include @@ -43598,7 +45584,7 @@ index 000000000000..7eb33da9c253 + + bch2_assert_pos_locked(trans, BTREE_ID_inodes, + POS(0, bi->bi_inum), -+ 0 && c->opts.inodes_use_key_cache); ++ c->opts.inodes_use_key_cache); + + set_nlink(&inode->v, bch2_inode_nlink_get(bi)); + i_uid_write(&inode->v, bi->bi_uid); @@ -43628,7 +45614,6 @@ index 000000000000..7eb33da9c253 + int ret; + + bch2_trans_init(&trans, c, 0, 512); -+ trans.ip = _RET_IP_; +retry: + bch2_trans_begin(&trans); + @@ -44360,8 +46345,8 @@ index 000000000000..7eb33da9c253 + else + offset += p.crc.offset; + -+ if ((offset & (c->opts.block_size - 1)) || -+ (k.k->size & (c->opts.block_size - 1))) ++ if ((offset & (block_sectors(c) - 1)) || ++ (k.k->size & (block_sectors(c) - 1))) + flags2 |= FIEMAP_EXTENT_NOT_ALIGNED; + + ret = fiemap_fill_next_extent(info, @@ -44428,9 +46413,9 @@ index 000000000000..7eb33da9c253 + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + SPOS(ei->v.i_ino, start, snapshot), 0); + -+ while ((k = bch2_btree_iter_peek(&iter)).k && -+ !(ret = bkey_err(k)) && -+ bkey_cmp(iter.pos, end) 
< 0) { ++ while (!(ret = btree_trans_too_many_iters(&trans)) && ++ (k = bch2_btree_iter_peek_upto(&iter, end)).k && ++ !(ret = bkey_err(k))) { + enum btree_id data_btree = BTREE_ID_extents; + + if (!bkey_extent_is_data(k.k) && @@ -44966,12 +46951,12 @@ index 000000000000..7eb33da9c253 + KEY_TYPE_QUOTA_WARN); + bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, + KEY_TYPE_QUOTA_WARN); -+ bch2_inode_rm(c, inode_inum(inode), true); ++ bch2_inode_rm(c, inode_inum(inode)); + } +} + +void bch2_evict_subvolume_inodes(struct bch_fs *c, -+ struct snapshot_id_list *s) ++ snapshot_id_list *s) +{ + struct super_block *sb = c->vfs_sb; + struct inode *inode; @@ -45169,25 +47154,30 @@ index 000000000000..7eb33da9c253 +{ + struct bch_fs *c = root->d_sb->s_fs_info; + enum bch_opt_id i; -+ char buf[512]; ++ struct printbuf buf = PRINTBUF; ++ int ret = 0; + + for (i = 0; i < bch2_opts_nr; i++) { + const struct bch_option *opt = &bch2_opt_table[i]; + u64 v = bch2_opt_get_by_id(&c->opts, i); + -+ if (!(opt->mode & OPT_MOUNT)) ++ if (!(opt->flags & OPT_MOUNT)) + continue; + + if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) + continue; + -+ bch2_opt_to_text(&PBUF(buf), c, opt, v, ++ printbuf_reset(&buf); ++ bch2_opt_to_text(&buf, c, c->disk_sb.sb, opt, v, + OPT_SHOW_MOUNT_STYLE); + seq_putc(seq, ','); -+ seq_puts(seq, buf); ++ seq_puts(seq, buf.buf); + } + -+ return 0; ++ if (buf.allocation_failure) ++ ret = -ENOMEM; ++ printbuf_exit(&buf); ++ return ret; +} + +static void bch2_put_super(struct super_block *sb) @@ -45429,7 +47419,7 @@ index 000000000000..7eb33da9c253 +#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h new file mode 100644 -index 000000000000..b2211ec7f302 +index 000000000000..9f4b57e30e2a --- /dev/null +++ b/fs/bcachefs/fs.h @@ -0,0 +1,208 @@ @@ -45626,7 +47616,7 @@ index 000000000000..b2211ec7f302 + struct iattr *); +int __bch2_unlink(struct inode *, struct dentry *, bool); + -+void bch2_evict_subvolume_inodes(struct bch_fs *, struct 
snapshot_id_list *); ++void bch2_evict_subvolume_inodes(struct bch_fs *, snapshot_id_list *); + +void bch2_vfs_exit(void); +int bch2_vfs_init(void); @@ -45634,7 +47624,7 @@ index 000000000000..b2211ec7f302 +#else + +static inline void bch2_evict_subvolume_inodes(struct bch_fs *c, -+ struct snapshot_id_list *s) {} ++ snapshot_id_list *s) {} +static inline void bch2_vfs_exit(void) {} +static inline int bch2_vfs_init(void) { return 0; } + @@ -45643,15 +47633,16 @@ index 000000000000..b2211ec7f302 +#endif /* _BCACHEFS_FS_H */ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c new file mode 100644 -index 000000000000..361dbf338023 +index 000000000000..2582ddf14803 --- /dev/null +++ b/fs/bcachefs/fsck.c -@@ -0,0 +1,2345 @@ +@@ -0,0 +1,2356 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "bkey_buf.h" +#include "btree_update.h" ++#include "darray.h" +#include "dirent.h" +#include "error.h" +#include "fs-common.h" @@ -46120,11 +48111,11 @@ index 000000000000..361dbf338023 + pos.snapshot = snapshot_t(c, pos.snapshot)->equiv; + + if (bkey_cmp(s->pos, pos)) -+ s->nr = 0; ++ s->ids.nr = 0; + s->pos = pos; + + /* Might get called multiple times due to lock restarts */ -+ if (s->nr && s->d[s->nr - 1] == pos.snapshot) ++ if (s->ids.nr && s->ids.data[s->ids.nr - 1] == pos.snapshot) + return 0; + + return snapshots_seen_add(c, s, pos.snapshot); @@ -46147,7 +48138,7 @@ index 000000000000..361dbf338023 + ancestor = snapshot_t(c, ancestor)->equiv; + + /* @ancestor should be the snapshot most recently added to @seen */ -+ BUG_ON(!seen->nr || seen->d[seen->nr - 1] != ancestor); ++ BUG_ON(!seen->ids.nr || seen->ids.data[seen->ids.nr - 1] != ancestor); + BUG_ON(seen->pos.snapshot != ancestor); + + if (id == ancestor) @@ -46156,11 +48147,11 @@ index 000000000000..361dbf338023 + if (!bch2_snapshot_is_ancestor(c, id, ancestor)) + return false; + -+ for (i = seen->nr - 2; -+ i >= 0 && seen->d[i] >= id; ++ for (i = seen->ids.nr - 2; ++ i >= 0 && seen->ids.data[i] >= 
id; + --i) -+ if (bch2_snapshot_is_ancestor(c, id, seen->d[i]) && -+ bch2_snapshot_is_ancestor(c, seen->d[i], ancestor)) ++ if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i]) && ++ bch2_snapshot_is_ancestor(c, seen->ids.data[i], ancestor)) + return false; + + return true; @@ -46186,26 +48177,25 @@ index 000000000000..361dbf338023 +} + +#define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \ -+ for (_i = (_w)->d; _i < (_w)->d + (_w)->nr && (_i)->snapshot <= (_snapshot); _i++)\ ++ for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && (_i)->snapshot <= (_snapshot); _i++)\ + if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot)) + ++struct inode_walker_entry { ++ struct bch_inode_unpacked inode; ++ u32 snapshot; ++ u64 count; ++}; ++ +struct inode_walker { + bool first_this_inode; + u64 cur_inum; + -+ size_t nr; -+ size_t size; -+ struct inode_walker_entry { -+ struct bch_inode_unpacked inode; -+ u32 snapshot; -+ u64 count; -+ } *d; ++ DARRAY(struct inode_walker_entry) inodes; +}; + +static void inode_walker_exit(struct inode_walker *w) +{ -+ kfree(w->d); -+ w->d = NULL; ++ darray_exit(w->inodes); +} + +static struct inode_walker inode_walker_init(void) @@ -46213,40 +48203,17 @@ index 000000000000..361dbf338023 + return (struct inode_walker) { 0, }; +} + -+static int inode_walker_realloc(struct inode_walker *w) -+{ -+ if (w->nr == w->size) { -+ size_t new_size = max_t(size_t, 8UL, w->size * 2); -+ void *d = krealloc(w->d, new_size * sizeof(w->d[0]), -+ GFP_KERNEL); -+ if (!d) -+ return -ENOMEM; -+ -+ w->d = d; -+ w->size = new_size; -+ } -+ -+ return 0; -+} -+ +static int add_inode(struct bch_fs *c, struct inode_walker *w, + struct bkey_s_c inode) +{ + struct bch_inode_unpacked u; -+ int ret; -+ -+ ret = inode_walker_realloc(w); -+ if (ret) -+ return ret; + + BUG_ON(bch2_inode_unpack(inode, &u)); + -+ w->d[w->nr++] = (struct inode_walker_entry) { ++ return darray_push(w->inodes, ((struct inode_walker_entry) { + .inode = u, + .snapshot 
= snapshot_t(c, inode.k->p.snapshot)->equiv, -+ }; -+ -+ return 0; ++ })); +} + +static int __walk_inode(struct btree_trans *trans, @@ -46265,7 +48232,7 @@ index 000000000000..361dbf338023 + goto lookup_snapshot; + } + -+ w->nr = 0; ++ w->inodes.nr = 0; + + for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, pos.inode), + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { @@ -46283,26 +48250,25 @@ index 000000000000..361dbf338023 + w->cur_inum = pos.inode; + w->first_this_inode = true; +lookup_snapshot: -+ for (i = 0; i < w->nr; i++) -+ if (bch2_snapshot_is_ancestor(c, pos.snapshot, w->d[i].snapshot)) ++ for (i = 0; i < w->inodes.nr; i++) ++ if (bch2_snapshot_is_ancestor(c, pos.snapshot, w->inodes.data[i].snapshot)) + goto found; + return INT_MAX; +found: -+ BUG_ON(pos.snapshot > w->d[i].snapshot); ++ BUG_ON(pos.snapshot > w->inodes.data[i].snapshot); + -+ if (pos.snapshot != w->d[i].snapshot) { ++ if (pos.snapshot != w->inodes.data[i].snapshot) { + ancestor_pos = i; + -+ while (i && w->d[i - 1].snapshot > pos.snapshot) ++ while (i && w->inodes.data[i - 1].snapshot > pos.snapshot) + --i; + -+ ret = inode_walker_realloc(w); ++ ret = darray_insert_item(w->inodes, i, w->inodes.data[ancestor_pos]); + if (ret) + return ret; + -+ array_insert_item(w->d, w->nr, i, w->d[ancestor_pos]); -+ w->d[i].snapshot = pos.snapshot; -+ w->d[i].count = 0; ++ w->inodes.data[i].snapshot = pos.snapshot; ++ w->inodes.data[i].count = 0; + } + + return i; @@ -46318,7 +48284,7 @@ index 000000000000..361dbf338023 + struct bkey_s_c k; + int ret; + -+ w->nr = 0; ++ w->inodes.nr = 0; + + for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum), + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { @@ -46344,15 +48310,16 @@ index 000000000000..361dbf338023 + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; -+ char buf[200]; ++ struct printbuf buf = PRINTBUF; + int ret = 0; + + if (mustfix_fsck_err_on(!snapshot_t(c, k.k->p.snapshot)->equiv, c, + "key in missing snapshot: %s", -+ (bch2_bkey_val_to_text(&PBUF(buf), c, 
k), buf))) -+ return bch2_btree_delete_at(trans, iter, ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ++ ret = bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1; +fsck_err: ++ printbuf_exit(&buf); + return ret; +} + @@ -46392,7 +48359,7 @@ index 000000000000..361dbf338023 +{ + struct bch_fs *c = trans->c; + struct btree_iter iter = { NULL }; -+ char buf[200]; ++ struct printbuf buf = PRINTBUF; + struct bkey_s_c k; + u64 hash; + int ret = 0; @@ -46416,8 +48383,9 @@ index 000000000000..361dbf338023 + if (fsck_err_on(k.k->type == desc.key_type && + !desc.cmp_bkey(k, hash_k), c, + "duplicate hash table keys:\n%s", -+ (bch2_bkey_val_to_text(&PBUF(buf), c, -+ hash_k), buf))) { ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, hash_k), ++ buf.buf))) { + ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1; + break; + } @@ -46428,13 +48396,16 @@ index 000000000000..361dbf338023 + } + + } ++out: + bch2_trans_iter_exit(trans, &iter); ++ printbuf_exit(&buf); + return ret; +bad_hash: + if (fsck_err(c, "hash table key at wrong offset: btree %u inode %llu offset %llu, " + "hashed to %llu\n%s", + desc.btree_id, hash_k.k->p.inode, hash_k.k->p.offset, hash, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, hash_k), buf)) == FSCK_ERR_IGNORE) ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf)) == FSCK_ERR_IGNORE) + return 0; + + ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k); @@ -46442,9 +48413,9 @@ index 000000000000..361dbf338023 + bch_err(c, "hash_redo_key err %i", ret); + return ret; + } -+ return -EINTR; ++ ret = -EINTR; +fsck_err: -+ return ret; ++ goto out; +} + +static int check_inode(struct btree_trans *trans, @@ -46774,7 +48745,7 @@ index 000000000000..361dbf338023 + int ret = 0, ret2 = 0; + s64 count2; + -+ for (i = w->d; i < w->d + w->nr; i++) { ++ darray_for_each(w->inodes, i) { + if (i->inode.bi_sectors == i->count) + continue; + @@ -46812,32 +48783,34 @@ index 
000000000000..361dbf338023 + struct bch_fs *c = trans->c; + struct bkey_s_c k; + struct inode_walker_entry *i; -+ char buf[200]; ++ struct printbuf buf = PRINTBUF; + int ret = 0; + + k = bch2_btree_iter_peek(iter); + if (!k.k) -+ return 0; ++ goto out; + + ret = bkey_err(k); + if (ret) -+ return ret; ++ goto err; + + ret = check_key_has_snapshot(trans, iter, k); -+ if (ret) -+ return ret < 0 ? ret : 0; ++ if (ret) { ++ ret = ret < 0 ? ret : 0; ++ goto out; ++ } + + ret = snapshots_seen_update(c, s, k.k->p); + if (ret) -+ return ret; ++ goto err; + + if (k.k->type == KEY_TYPE_whiteout) -+ return 0; ++ goto out; + + if (inode->cur_inum != k.k->p.inode) { + ret = check_i_sectors(trans, inode); + if (ret) -+ return ret; ++ goto err; + } +#if 0 + if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { @@ -46847,33 +48820,43 @@ index 000000000000..361dbf338023 + bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k)); + bch2_bkey_val_to_text(&PBUF(buf2), c, k); + -+ if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) -+ return fix_overlapping_extent(trans, k, prev.k->k.p) ?: -EINTR; ++ if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) { ++ ret = fix_overlapping_extent(trans, k, prev.k->k.p) ?: -EINTR; ++ goto out; ++ } + } +#endif + ret = __walk_inode(trans, inode, k.k->p); + if (ret < 0) -+ return ret; ++ goto err; + + if (fsck_err_on(ret == INT_MAX, c, + "extent in missing inode:\n %s", -+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) -+ return bch2_btree_delete_at(trans, iter, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ ret = bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); ++ goto out; ++ } + -+ if (ret == INT_MAX) -+ return 0; ++ if (ret == INT_MAX) { ++ ret = 0; ++ goto out; ++ } + -+ i = inode->d + ret; ++ i = inode->inodes.data + ret; + ret = 0; + + if (fsck_err_on(!S_ISREG(i->inode.bi_mode) && + !S_ISLNK(i->inode.bi_mode), c, + "extent in non regular inode mode %o:\n %s", + 
i->inode.bi_mode, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) -+ return bch2_btree_delete_at(trans, iter, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ ret = bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); ++ goto out; ++ } + + if (!bch2_snapshot_internal_node(c, k.k->p.snapshot)) { + for_each_visible_inode(c, s, inode, k.k->p.snapshot, i) { @@ -46883,11 +48866,12 @@ index 000000000000..361dbf338023 + "extent type %u offset %llu past end of inode %llu, i_size %llu", + k.k->type, k.k->p.offset, k.k->p.inode, i->inode.bi_size)) { + bch2_fs_lazy_rw(c); -+ return bch2_btree_delete_range_trans(trans, BTREE_ID_extents, ++ ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, + SPOS(k.k->p.inode, round_up(i->inode.bi_size, block_bytes(c)) >> 9, + k.k->p.snapshot), + POS(k.k->p.inode, U64_MAX), + 0, NULL) ?: -EINTR; ++ goto out; + } + } + } @@ -46899,7 +48883,10 @@ index 000000000000..361dbf338023 + bch2_bkey_buf_reassemble(&prev, c, k); +#endif + ++out: ++err: +fsck_err: ++ printbuf_exit(&buf); + return ret; +} + @@ -46958,12 +48945,13 @@ index 000000000000..361dbf338023 + int ret = 0, ret2 = 0; + s64 count2; + -+ for (i = w->d; i < w->d + w->nr; i++) { ++ darray_for_each(w->inodes, i) { + if (i->inode.bi_nlink == i->count) + continue; + -+ count2 = lockrestart_do(trans, -+ bch2_count_subdirs(trans, w->cur_inum, i->snapshot)); ++ count2 = bch2_count_subdirs(trans, w->cur_inum, i->snapshot); ++ if (count2 < 0) ++ return count2; + + if (i->count != count2) { + bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu", @@ -46996,7 +48984,7 @@ index 000000000000..361dbf338023 + struct bch_fs *c = trans->c; + struct bkey_i_dirent *n; + bool backpointer_exists = true; -+ char buf[200]; ++ struct printbuf buf = PRINTBUF; + int ret = 0; + + if (!target->bi_dir && @@ -47022,9 +49010,7 @@ index 000000000000..361dbf338023 + "directory %llu with multiple links", + target->bi_inum)) { + ret = 
__remove_dirent(trans, d.k->p); -+ if (ret) -+ goto err; -+ return 0; ++ goto out; + } + + if (fsck_err_on(backpointer_exists && @@ -47061,18 +49047,19 @@ index 000000000000..361dbf338023 + "incorrect d_type: got %s, should be %s:\n%s", + bch2_d_type_str(d.v->d_type), + bch2_d_type_str(inode_d_type(target)), -+ (bch2_bkey_val_to_text(&PBUF(buf), c, d.s_c), buf))) { ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { + n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) -+ return ret; ++ goto err; + + bkey_reassemble(&n->k_i, d.s_c); + n->v.d_type = inode_d_type(target); + + ret = bch2_trans_update(trans, iter, &n->k_i, 0); + if (ret) -+ return ret; ++ goto err; + + d = dirent_i_to_s_c(n); + } @@ -47086,19 +49073,21 @@ index 000000000000..361dbf338023 + n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) -+ return ret; ++ goto err; + + bkey_reassemble(&n->k_i, d.s_c); + n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); + + ret = bch2_trans_update(trans, iter, &n->k_i, 0); + if (ret) -+ return ret; ++ goto err; + + d = dirent_i_to_s_c(n); + } ++out: +err: +fsck_err: ++ printbuf_exit(&buf); + return ret; +} + @@ -47112,68 +49101,81 @@ index 000000000000..361dbf338023 + struct bkey_s_c k; + struct bkey_s_c_dirent d; + struct inode_walker_entry *i; -+ char buf[200]; -+ int ret; ++ struct printbuf buf = PRINTBUF; ++ int ret = 0; + + k = bch2_btree_iter_peek(iter); + if (!k.k) -+ return 0; ++ goto out; + + ret = bkey_err(k); + if (ret) -+ return ret; ++ goto err; + + ret = check_key_has_snapshot(trans, iter, k); -+ if (ret) -+ return ret < 0 ? ret : 0; ++ if (ret) { ++ ret = ret < 0 ? 
ret : 0; ++ goto out; ++ } + + ret = snapshots_seen_update(c, s, k.k->p); + if (ret) -+ return ret; ++ goto err; + + if (k.k->type == KEY_TYPE_whiteout) -+ return 0; ++ goto out; + + if (dir->cur_inum != k.k->p.inode) { + ret = check_subdir_count(trans, dir); + if (ret) -+ return ret; ++ goto err; + } + + ret = __walk_inode(trans, dir, k.k->p); + if (ret < 0) -+ return ret; ++ goto err; + + if (fsck_err_on(ret == INT_MAX, c, + "dirent in nonexisting directory:\n%s", -+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) -+ return bch2_btree_delete_at(trans, iter, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ ret = bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); ++ goto out; ++ } + -+ if (ret == INT_MAX) -+ return 0; ++ if (ret == INT_MAX) { ++ ret = 0; ++ goto out; ++ } + -+ i = dir->d + ret; ++ i = dir->inodes.data + ret; + ret = 0; + + if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c, + "dirent in non directory inode type %s:\n%s", + bch2_d_type_str(inode_d_type(&i->inode)), -+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) -+ return bch2_btree_delete_at(trans, iter, 0); ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ ret = bch2_btree_delete_at(trans, iter, 0); ++ goto out; ++ } + + if (dir->first_this_inode) -+ *hash_info = bch2_hash_info_init(c, &dir->d[0].inode); ++ *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode); + + ret = hash_check_key(trans, bch2_dirent_hash_desc, + hash_info, iter, k); + if (ret < 0) -+ return ret; -+ if (ret) /* dirent has been deleted */ -+ return 0; ++ goto err; ++ if (ret) { ++ /* dirent has been deleted */ ++ ret = 0; ++ goto out; ++ } + + if (k.k->type != KEY_TYPE_dirent) -+ return 0; ++ goto out; + + d = bkey_s_c_to_dirent(k); + @@ -47186,24 +49188,27 @@ index 000000000000..361dbf338023 + ret = __subvol_lookup(trans, target_subvol, + &target_snapshot, &target_inum); + if (ret && ret != -ENOENT) -+ return ret; ++ goto err; + + if 
(fsck_err_on(ret, c, + "dirent points to missing subvolume %llu", -+ le64_to_cpu(d.v->d_child_subvol))) -+ return __remove_dirent(trans, d.k->p); ++ le64_to_cpu(d.v->d_child_subvol))) { ++ ret = __remove_dirent(trans, d.k->p); ++ goto err; ++ } + + ret = __lookup_inode(trans, target_inum, + &subvol_root, &target_snapshot); + if (ret && ret != -ENOENT) -+ return ret; ++ goto err; + + if (fsck_err_on(ret, c, + "subvolume %u points to missing subvolume root %llu", + target_subvol, + target_inum)) { + bch_err(c, "repair not implemented yet"); -+ return -EINVAL; ++ ret = -EINVAL; ++ goto err; + } + + if (fsck_err_on(subvol_root.bi_subvol != target_subvol, c, @@ -47213,32 +49218,33 @@ index 000000000000..361dbf338023 + subvol_root.bi_subvol = target_subvol; + ret = __write_inode(trans, &subvol_root, target_snapshot); + if (ret) -+ return ret; ++ goto err; + } + + ret = check_dirent_target(trans, iter, d, &subvol_root, + target_snapshot); + if (ret) -+ return ret; ++ goto err; + } else { + ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum)); + if (ret) -+ return ret; ++ goto err; + -+ if (fsck_err_on(!target->nr, c, ++ if (fsck_err_on(!target->inodes.nr, c, + "dirent points to missing inode:\n%s", -+ (bch2_bkey_val_to_text(&PBUF(buf), c, -+ k), buf))) { ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), ++ buf.buf))) { + ret = __remove_dirent(trans, d.k->p); + if (ret) -+ return ret; ++ goto err; + } + -+ for (i = target->d; i < target->d + target->nr; i++) { ++ darray_for_each(target->inodes, i) { + ret = check_dirent_target(trans, iter, d, + &i->inode, i->snapshot); + if (ret) -+ return ret; ++ goto err; + } + } + @@ -47246,7 +49252,10 @@ index 000000000000..361dbf338023 + for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) + i->count++; + ++out: ++err: +fsck_err: ++ printbuf_exit(&buf); + return ret; +} + @@ -47329,7 +49338,7 @@ index 000000000000..361dbf338023 + ret = 0; + + if (inode->first_this_inode) -+ *hash_info = 
bch2_hash_info_init(c, &inode->d[0].inode); ++ *hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode); + + ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k); +fsck_err: @@ -47439,21 +49448,18 @@ index 000000000000..361dbf338023 + check_root_trans(&trans)); +} + -+struct pathbuf { -+ size_t nr; -+ size_t size; -+ -+ struct pathbuf_entry { -+ u64 inum; -+ u32 snapshot; -+ } *entries; ++struct pathbuf_entry { ++ u64 inum; ++ u32 snapshot; +}; + -+static bool path_is_dup(struct pathbuf *p, u64 inum, u32 snapshot) ++typedef DARRAY(struct pathbuf_entry) pathbuf; ++ ++static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) +{ + struct pathbuf_entry *i; + -+ for (i = p->entries; i < p->entries + p->nr; i++) ++ darray_for_each(*p, i) + if (i->inum == inum && + i->snapshot == snapshot) + return true; @@ -47461,26 +49467,18 @@ index 000000000000..361dbf338023 + return false; +} + -+static int path_down(struct pathbuf *p, u64 inum, u32 snapshot) ++static int path_down(struct bch_fs *c, pathbuf *p, ++ u64 inum, u32 snapshot) +{ -+ if (p->nr == p->size) { -+ size_t new_size = max_t(size_t, 256UL, p->size * 2); -+ void *n = krealloc(p->entries, -+ new_size * sizeof(p->entries[0]), -+ GFP_KERNEL); -+ if (!n) { -+ return -ENOMEM; -+ } -+ -+ p->entries = n; -+ p->size = new_size; -+ }; -+ -+ p->entries[p->nr++] = (struct pathbuf_entry) { ++ int ret = darray_push(*p, ((struct pathbuf_entry) { + .inum = inum, + .snapshot = snapshot, -+ }; -+ return 0; ++ })); ++ ++ if (ret) ++ bch_err(c, "fsck: error allocating memory for pathbuf, size %zu", ++ p->size); ++ return ret; +} + +/* @@ -47489,7 +49487,7 @@ index 000000000000..361dbf338023 + * XXX: we should also be verifying that inodes are in the right subvolumes + */ +static int check_path(struct btree_trans *trans, -+ struct pathbuf *p, ++ pathbuf *p, + struct bch_inode_unpacked *inode, + u32 snapshot) +{ @@ -47542,7 +49540,7 @@ index 000000000000..361dbf338023 + if (!S_ISDIR(inode->bi_mode)) + break; 
+ -+ ret = path_down(p, inode->bi_inum, snapshot); ++ ret = path_down(c, p, inode->bi_inum, snapshot); + if (ret) { + bch_err(c, "memory allocation failure"); + return ret; @@ -47563,7 +49561,7 @@ index 000000000000..361dbf338023 + /* XXX print path */ + bch_err(c, "directory structure loop"); + -+ for (i = p->entries; i < p->entries + p->nr; i++) ++ darray_for_each(*p, i) + pr_err("%llu:%u", i->inum, i->snapshot); + pr_err("%llu:%u", inode->bi_inum, snapshot); + @@ -47600,7 +49598,7 @@ index 000000000000..361dbf338023 + struct btree_iter iter; + struct bkey_s_c k; + struct bch_inode_unpacked u; -+ struct pathbuf path = { 0, 0, NULL }; ++ pathbuf path = { 0, }; + int ret; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); @@ -47630,7 +49628,7 @@ index 000000000000..361dbf338023 + + BUG_ON(ret == -EINTR); + -+ kfree(path.entries); ++ darray_exit(path); + + bch2_trans_exit(&trans); + return ret; @@ -47647,12 +49645,15 @@ index 000000000000..361dbf338023 + } *d; +}; + -+static int add_nlink(struct nlink_table *t, u64 inum, u32 snapshot) ++static int add_nlink(struct bch_fs *c, struct nlink_table *t, ++ u64 inum, u32 snapshot) +{ + if (t->nr == t->size) { + size_t new_size = max_t(size_t, 128UL, t->size * 2); + void *d = kvmalloc(new_size * sizeof(t->d[0]), GFP_KERNEL); + if (!d) { ++ bch_err(c, "fsck: error allocating memory for nlink_table, size %zu", ++ new_size); + return -ENOMEM; + } + @@ -47742,7 +49743,7 @@ index 000000000000..361dbf338023 + if (!u.bi_nlink) + continue; + -+ ret = add_nlink(t, k.k->p.offset, k.k->p.snapshot); ++ ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot); + if (ret) { + *end = k.k->p.offset; + ret = 0; @@ -48008,16 +50009,17 @@ index 000000000000..264f2706b12d +#endif /* _BCACHEFS_FSCK_H */ diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c new file mode 100644 -index 000000000000..ffce68a80490 +index 000000000000..14b0b595202d --- /dev/null +++ b/fs/bcachefs/inode.c -@@ -0,0 +1,744 @@ +@@ -0,0 +1,720 @@ +// 
SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_key_cache.h" +#include "bkey_methods.h" +#include "btree_update.h" ++#include "buckets.h" +#include "error.h" +#include "extents.h" +#include "extent_update.h" @@ -48038,16 +50040,6 @@ index 000000000000..ffce68a80490 +}; + +static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; -+static const u8 bits_table[8] = { -+ 1 * 8 - 1, -+ 2 * 8 - 2, -+ 3 * 8 - 3, -+ 4 * 8 - 4, -+ 6 * 8 - 5, -+ 8 * 8 - 6, -+ 10 * 8 - 7, -+ 13 * 8 - 8, -+}; + +static int inode_decode_field(const u8 *in, const u8 *end, + u64 out[2], unsigned *out_bits) @@ -48275,15 +50267,13 @@ index 000000000000..ffce68a80490 + u32 snapshot; + int ret; + -+ if (0 && trans->c->opts.inodes_use_key_cache) -+ flags |= BTREE_ITER_CACHED; -+ + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + return ret; + + bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, -+ SPOS(0, inum.inum, snapshot), flags); ++ SPOS(0, inum.inum, snapshot), ++ flags|BTREE_ITER_CACHED); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) @@ -48608,76 +50598,62 @@ index 000000000000..ffce68a80490 +static int bch2_inode_delete_keys(struct btree_trans *trans, + subvol_inum inum, enum btree_id id) +{ -+ u64 offset = 0; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_i delete; ++ u32 snapshot; + int ret = 0; + -+ while (!ret || ret == -EINTR) { -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct bkey_i delete; -+ u32 snapshot; ++ /* ++ * We're never going to be deleting extents, no need to use an extent ++ * iterator: ++ */ ++ bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0), ++ BTREE_ITER_NOT_EXTENTS| ++ BTREE_ITER_INTENT); + ++ while (1) { + bch2_trans_begin(trans); + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) -+ continue; ++ goto err; + -+ bch2_trans_iter_init(trans, &iter, id, -+ SPOS(inum.inum, offset, snapshot), -+ BTREE_ITER_INTENT); -+ k = 
bch2_btree_iter_peek(&iter); -+ -+ if (!k.k || iter.pos.inode != inum.inum) { -+ bch2_trans_iter_exit(trans, &iter); -+ break; -+ } ++ bch2_btree_iter_set_snapshot(&iter, snapshot); + ++ k = bch2_btree_iter_peek_upto(&iter, POS(inum.inum, U64_MAX)); + ret = bkey_err(k); + if (ret) + goto err; + ++ if (!k.k) ++ break; ++ + bkey_init(&delete.k); + delete.k.p = iter.pos; + -+ if (btree_node_type_is_extents(iter.btree_id)) { -+ unsigned max_sectors = -+ min_t(u64, U64_MAX - iter.pos.offset, -+ KEY_SIZE_MAX & (~0 << trans->c->block_bits)); -+ -+ /* create the biggest key we can */ -+ bch2_key_resize(&delete.k, max_sectors); -+ -+ ret = bch2_extent_trim_atomic(trans, &iter, &delete); -+ if (ret) -+ goto err; -+ } -+ + ret = bch2_trans_update(trans, &iter, &delete, 0) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL); +err: -+ offset = iter.pos.offset; -+ bch2_trans_iter_exit(trans, &iter); ++ if (ret && ret != -EINTR) ++ break; + } + ++ bch2_trans_iter_exit(trans, &iter); + return ret; +} + -+int bch2_inode_rm(struct bch_fs *c, subvol_inum inum, bool cached) ++int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) +{ + struct btree_trans trans; + struct btree_iter iter = { NULL }; + struct bkey_i_inode_generation delete; + struct bch_inode_unpacked inode_u; + struct bkey_s_c k; -+ unsigned iter_flags = BTREE_ITER_INTENT; + u32 snapshot; + int ret; + -+ if (0 && cached && c->opts.inodes_use_key_cache) -+ iter_flags |= BTREE_ITER_CACHED; -+ + bch2_trans_init(&trans, c, 0, 1024); + + /* @@ -48701,7 +50677,8 @@ index 000000000000..ffce68a80490 + goto err; + + bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, -+ SPOS(0, inum.inum, snapshot), iter_flags); ++ SPOS(0, inum.inum, snapshot), ++ BTREE_ITER_INTENT|BTREE_ITER_CACHED); + k = bch2_btree_iter_peek_slot(&iter); + + ret = bkey_err(k); @@ -48758,10 +50735,10 @@ index 000000000000..ffce68a80490 +} diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h new file mode 100644 -index 000000000000..723186d8afb6 
+index 000000000000..2337ecfc600e --- /dev/null +++ b/fs/bcachefs/inode.h -@@ -0,0 +1,200 @@ +@@ -0,0 +1,204 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_INODE_H +#define _BCACHEFS_INODE_H @@ -48777,11 +50754,15 @@ index 000000000000..723186d8afb6 +#define bch2_bkey_ops_inode (struct bkey_ops) { \ + .key_invalid = bch2_inode_invalid, \ + .val_to_text = bch2_inode_to_text, \ ++ .trans_trigger = bch2_trans_mark_inode, \ ++ .atomic_trigger = bch2_mark_inode, \ +} + +#define bch2_bkey_ops_inode_v2 (struct bkey_ops) { \ + .key_invalid = bch2_inode_v2_invalid, \ + .val_to_text = bch2_inode_to_text, \ ++ .trans_trigger = bch2_trans_mark_inode, \ ++ .atomic_trigger = bch2_mark_inode, \ +} + +static inline bool bkey_is_inode(const struct bkey *k) @@ -48851,7 +50832,7 @@ index 000000000000..723186d8afb6 +int bch2_inode_create(struct btree_trans *, struct btree_iter *, + struct bch_inode_unpacked *, u32, u64); + -+int bch2_inode_rm(struct bch_fs *, subvol_inum, bool); ++int bch2_inode_rm(struct bch_fs *, subvol_inum); + +int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum, + struct bch_inode_unpacked *); @@ -48964,10 +50945,10 @@ index 000000000000..723186d8afb6 +#endif /* _BCACHEFS_INODE_H */ diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c new file mode 100644 -index 000000000000..5a3c9eff1b50 +index 000000000000..36929451af2c --- /dev/null +++ b/fs/bcachefs/io.c -@@ -0,0 +1,2375 @@ +@@ -0,0 +1,2416 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Some low level IO code, and hacks for various block layer limitations @@ -49635,11 +51616,7 @@ index 000000000000..5a3c9eff1b50 +{ + struct bch_fs *c = op->c; + struct bkey_i_extent *e; -+ struct open_bucket *ob; -+ unsigned i; + -+ BUG_ON(crc.compressed_size > wp->sectors_free); -+ wp->sectors_free -= crc.compressed_size; + op->pos.offset += crc.uncompressed_size; + + e = bkey_extent_init(op->insert_keys.top); @@ -49652,22 +51629,8 @@ index 000000000000..5a3c9eff1b50 + crc.nonce) + 
bch2_extent_crc_append(&e->k_i, crc); + -+ open_bucket_for_each(c, &wp->ptrs, ob, i) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); -+ union bch_extent_entry *end = -+ bkey_val_end(bkey_i_to_s(&e->k_i)); -+ -+ end->ptr = ob->ptr; -+ end->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; -+ end->ptr.cached = !ca->mi.durability || -+ (op->flags & BCH_WRITE_CACHED) != 0; -+ end->ptr.offset += ca->mi.bucket_size - ob->sectors_free; -+ -+ e->k.u64s++; -+ -+ BUG_ON(crc.compressed_size > ob->sectors_free); -+ ob->sectors_free -= crc.compressed_size; -+ } ++ bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, crc.compressed_size, ++ op->flags & BCH_WRITE_CACHED); + + bch2_keylist_push(&op->insert_keys); +} @@ -49708,7 +51671,7 @@ index 000000000000..5a3c9eff1b50 + */ + bch2_bio_alloc_pages_pool(c, bio, + min_t(unsigned, output_available, -+ c->sb.encoded_extent_max << 9)); ++ c->opts.encoded_extent_max)); + + if (bio->bi_iter.bi_size < output_available) + *page_alloc_failed = @@ -49752,6 +51715,7 @@ index 000000000000..5a3c9eff1b50 + struct bch_fs *c = op->c; + struct nonce nonce = extent_nonce(op->version, op->crc); + struct bch_csum csum; ++ int ret; + + if (!bch2_csum_type_is_encryption(op->crc.csum_type)) + return 0; @@ -49766,10 +51730,10 @@ index 000000000000..5a3c9eff1b50 + if (bch2_crc_cmp(op->crc.csum, csum)) + return -EIO; + -+ bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); ++ ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); + op->crc.csum_type = 0; + op->crc.csum = (struct bch_csum) { 0, 0 }; -+ return 0; ++ return ret; +} + +static enum prep_encoded_ret { @@ -49905,8 +51869,8 @@ index 000000000000..5a3c9eff1b50 + size_t dst_len, src_len; + + if (page_alloc_failed && -+ bio_sectors(dst) < wp->sectors_free && -+ bio_sectors(dst) < c->sb.encoded_extent_max) ++ dst->bi_iter.bi_size < (wp->sectors_free << 9) && ++ dst->bi_iter.bi_size < c->opts.encoded_extent_max) + break; + + BUG_ON(op->compression_type && @@ -49926,7 +51890,7 @@ 
index 000000000000..5a3c9eff1b50 + + if (op->csum_type) + dst_len = min_t(unsigned, dst_len, -+ c->sb.encoded_extent_max << 9); ++ c->opts.encoded_extent_max); + + if (bounce) { + swap(dst->bi_iter.bi_size, dst_len); @@ -49984,8 +51948,11 @@ index 000000000000..5a3c9eff1b50 + crc.live_size = src_len >> 9; + + swap(dst->bi_iter.bi_size, dst_len); -+ bch2_encrypt_bio(c, op->csum_type, -+ extent_nonce(version, crc), dst); ++ ret = bch2_encrypt_bio(c, op->csum_type, ++ extent_nonce(version, crc), dst); ++ if (ret) ++ goto err; ++ + crc.csum = bch2_checksum_bio(c, op->csum_type, + extent_nonce(version, crc), dst); + crc.csum_type = op->csum_type; @@ -50043,7 +52010,7 @@ index 000000000000..5a3c9eff1b50 + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct bch_fs *c = op->c; + struct write_point *wp; -+ struct bio *bio; ++ struct bio *bio = NULL; + bool skip_put = true; + unsigned nofs_flags; + int ret; @@ -50081,7 +52048,7 @@ index 000000000000..5a3c9eff1b50 + */ + wp = bch2_alloc_sectors_start(c, + op->target, -+ op->opts.erasure_code, ++ op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED), + op->write_point, + &op->devs_have, + op->nr_replicas, @@ -50259,7 +52226,7 @@ index 000000000000..5a3c9eff1b50 + bch2_keylist_init(&op->insert_keys, op->inline_keys); + wbio_init(bio)->put_bio = false; + -+ if (bio_sectors(bio) & (c->opts.block_size - 1)) { ++ if (bio->bi_iter.bi_size & (c->opts.block_size - 1)) { + bch_err_inum_ratelimited(c, op->pos.inode, + "misaligned write"); + op->error = -EIO; @@ -50760,6 +52727,7 @@ index 000000000000..5a3c9eff1b50 + struct nonce nonce = extent_nonce(rbio->version, crc); + unsigned nofs_flags; + struct bch_csum csum; ++ int ret; + + nofs_flags = memalloc_nofs_save(); + @@ -50794,7 +52762,10 @@ index 000000000000..5a3c9eff1b50 + crc.live_size = bvec_iter_sectors(rbio->bvec_iter); + + if (crc_is_compressed(crc)) { -+ bch2_encrypt_bio(c, crc.csum_type, nonce, src); ++ ret = bch2_encrypt_bio(c, crc.csum_type, 
nonce, src); ++ if (ret) ++ goto decrypt_err; ++ + if (bch2_bio_uncompress(c, src, dst, dst_iter, crc)) + goto decompression_err; + } else { @@ -50805,7 +52776,9 @@ index 000000000000..5a3c9eff1b50 + BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); + src->bi_iter.bi_size = dst_iter.bi_size; + -+ bch2_encrypt_bio(c, crc.csum_type, nonce, src); ++ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); ++ if (ret) ++ goto decrypt_err; + + if (rbio->bounce) { + struct bvec_iter src_iter = src->bi_iter; @@ -50818,7 +52791,10 @@ index 000000000000..5a3c9eff1b50 + * Re encrypt data we decrypted, so it's consistent with + * rbio->crc: + */ -+ bch2_encrypt_bio(c, crc.csum_type, nonce, src); ++ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); ++ if (ret) ++ goto decrypt_err; ++ + promote_start(rbio->promote, rbio); + rbio->promote = NULL; + } @@ -50853,6 +52829,11 @@ index 000000000000..5a3c9eff1b50 + "decompression error"); + bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + goto out; ++decrypt_err: ++ bch_err_inum_ratelimited(c, rbio->read_pos.inode, ++ "decrypt error"); ++ bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); ++ goto out; +} + +static void bch2_read_endio(struct bio *bio) @@ -50881,9 +52862,8 @@ index 000000000000..5a3c9eff1b50 + return; + } + -+ if (rbio->pick.ptr.cached && -+ (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || -+ ptr_stale(ca, &rbio->pick.ptr))) { ++ if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || ++ ptr_stale(ca, &rbio->pick.ptr)) { + atomic_long_inc(&c->read_realloc_races); + + if (rbio->flags & BCH_READ_RETRY_IF_STALE) @@ -50942,6 +52922,35 @@ index 000000000000..5a3c9eff1b50 + return ret; +} + ++static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, ++ struct bkey_s_c k, ++ struct bch_extent_ptr ptr) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev); ++ struct btree_iter iter; ++ struct printbuf buf = PRINTBUF; ++ int ret; ++ ++ 
bch2_bkey_val_to_text(&buf, c, k); ++ bch2_fs_inconsistent(c, "Attempting to read from stale dirty pointer: %s", buf.buf); ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, ++ POS(ptr.dev, PTR_BUCKET_NR(ca, &ptr)), ++ BTREE_ITER_CACHED); ++ ++ ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); ++ if (ret) ++ goto out; ++ ++ bch2_bkey_val_to_text(&buf, c, k); ++ bch_err(c, "%s", buf.buf); ++ bch_err(c, "memory gen: %u", *bucket_gen(ca, iter.pos.offset)); ++ bch2_trans_iter_exit(trans, &iter); ++out: ++ printbuf_exit(&buf); ++} ++ +int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + struct bvec_iter iter, struct bpos read_pos, + enum btree_id data_btree, struct bkey_s_c k, @@ -50951,7 +52960,7 @@ index 000000000000..5a3c9eff1b50 + struct bch_fs *c = trans->c; + struct extent_ptr_decoded pick; + struct bch_read_bio *rbio = NULL; -+ struct bch_dev *ca; ++ struct bch_dev *ca = NULL; + struct promote_op *promote = NULL; + bool bounce = false, read_full = false, narrow_crcs = false; + struct bpos data_pos = bkey_start_pos(k.k); @@ -50968,7 +52977,7 @@ index 000000000000..5a3c9eff1b50 + zero_fill_bio_iter(&orig->bio, iter); + goto out_read_done; + } -+ ++retry_pick: + pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick); + + /* hole or reservation - just zero fill: */ @@ -50981,8 +52990,27 @@ index 000000000000..5a3c9eff1b50 + goto err; + } + -+ if (pick_ret > 0) -+ ca = bch_dev_bkey_exists(c, pick.ptr.dev); ++ ca = bch_dev_bkey_exists(c, pick.ptr.dev); ++ ++ /* ++ * Stale dirty pointers are treated as IO errors, but @failed isn't ++ * allocated unless we're in the retry path - so if we're not in the ++ * retry path, don't check here, it'll be caught in bch2_read_endio() ++ * and we'll end up in the retry path: ++ */ ++ if ((flags & BCH_READ_IN_RETRY) && ++ !pick.ptr.cached && ++ unlikely(ptr_stale(ca, &pick.ptr))) { ++ read_from_stale_dirty_pointer(trans, k, pick.ptr); ++ bch2_mark_io_failure(failed, 
&pick); ++ goto retry_pick; ++ } ++ ++ /* ++ * Unlock the iterator while the btree node's lock is still in ++ * cache, before doing the IO: ++ */ ++ bch2_trans_unlock(trans); + + if (flags & BCH_READ_NODECODE) { + /* @@ -51229,7 +53257,7 @@ index 000000000000..5a3c9eff1b50 + + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + SPOS(inum.inum, bvec_iter.bi_sector, snapshot), -+ BTREE_ITER_SLOTS|BTREE_ITER_FILTER_SNAPSHOTS); ++ BTREE_ITER_SLOTS); + while (1) { + unsigned bytes, sectors, offset_into_extent; + enum btree_id data_btree = BTREE_ID_extents; @@ -51270,12 +53298,6 @@ index 000000000000..5a3c9eff1b50 + */ + sectors = min(sectors, k.k->size - offset_into_extent); + -+ /* -+ * Unlock the iterator while the btree node's lock is still in -+ * cache, before doing the IO: -+ */ -+ bch2_trans_unlock(&trans); -+ + bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; + swap(bvec_iter.bi_size, bytes); + @@ -51336,8 +53358,8 @@ index 000000000000..5a3c9eff1b50 + mempool_init_page_pool(&c->bio_bounce_pages, + max_t(unsigned, + c->opts.btree_node_size, -+ c->sb.encoded_extent_max) / -+ PAGE_SECTORS, 0) || ++ c->opts.encoded_extent_max) / ++ PAGE_SIZE, 0) || + rhashtable_init(&c->promote_table, &bch_promote_params)) + return -ENOMEM; + @@ -51345,7 +53367,7 @@ index 000000000000..5a3c9eff1b50 +} diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h new file mode 100644 -index 000000000000..1aa422dccef7 +index 000000000000..fb5114518666 --- /dev/null +++ b/fs/bcachefs/io.h @@ -0,0 +1,189 @@ @@ -51401,7 +53423,7 @@ index 000000000000..1aa422dccef7 + +static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) +{ -+ return op->alloc_reserve == RESERVE_MOVINGGC ++ return op->alloc_reserve == RESERVE_movinggc + ? 
op->c->copygc_wq + : op->c->btree_update_wq; +} @@ -51430,7 +53452,7 @@ index 000000000000..1aa422dccef7 + op->compression_type = bch2_compression_opt_to_type[opts.compression]; + op->nr_replicas = 0; + op->nr_replicas_required = c->opts.data_replicas_required; -+ op->alloc_reserve = RESERVE_NONE; ++ op->alloc_reserve = RESERVE_none; + op->incompressible = 0; + op->open_buckets.nr = 0; + op->devs_have.nr = 0; @@ -51707,10 +53729,10 @@ index 000000000000..78bff13d36f2 +#endif /* _BCACHEFS_IO_TYPES_H */ diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c new file mode 100644 -index 000000000000..14bea8a2535e +index 000000000000..505e8367b5f2 --- /dev/null +++ b/fs/bcachefs/journal.c -@@ -0,0 +1,1284 @@ +@@ -0,0 +1,1410 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * bcachefs journalling code, for btree insertions @@ -51728,23 +53750,26 @@ index 000000000000..14bea8a2535e +#include "journal.h" +#include "journal_io.h" +#include "journal_reclaim.h" ++#include "journal_sb.h" +#include "journal_seq_blacklist.h" -+#include "super-io.h" + +#include + -+static u64 last_unwritten_seq(struct journal *j) -+{ -+ union journal_res_state s = READ_ONCE(j->reservations); ++#define x(n) #n, ++static const char * const bch2_journal_watermarks[] = { ++ JOURNAL_WATERMARKS() ++ NULL ++}; + -+ lockdep_assert_held(&j->lock); -+ -+ return journal_cur_seq(j) - ((s.idx - s.unwritten_idx) & JOURNAL_BUF_MASK); -+} ++static const char * const bch2_journal_errors[] = { ++ JOURNAL_ERRORS() ++ NULL ++}; ++#undef x + +static inline bool journal_seq_unwritten(struct journal *j, u64 seq) +{ -+ return seq >= last_unwritten_seq(j); ++ return seq > j->seq_ondisk; +} + +static bool __journal_entry_is_open(union journal_res_state state) @@ -51752,6 +53777,11 @@ index 000000000000..14bea8a2535e + return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; +} + ++static inline unsigned nr_unwritten_journal_entries(struct journal *j) ++{ ++ return atomic64_read(&j->seq) - j->seq_ondisk; ++} ++ +static 
bool journal_entry_is_open(struct journal *j) +{ + return __journal_entry_is_open(j->reservations); @@ -51763,8 +53793,6 @@ index 000000000000..14bea8a2535e + struct journal_buf *buf = NULL; + + EBUG_ON(seq > journal_cur_seq(j)); -+ EBUG_ON(seq == journal_cur_seq(j) && -+ j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL); + + if (journal_seq_unwritten(j, seq)) { + buf = j->buf + (seq & JOURNAL_BUF_MASK); @@ -51782,54 +53810,6 @@ index 000000000000..14bea8a2535e + p->devs.nr = 0; +} + -+static void journal_pin_new_entry(struct journal *j) -+{ -+ /* -+ * The fifo_push() needs to happen at the same time as j->seq is -+ * incremented for journal_last_seq() to be calculated correctly -+ */ -+ atomic64_inc(&j->seq); -+ journal_pin_list_init(fifo_push_ref(&j->pin), 1); -+} -+ -+static void bch2_journal_buf_init(struct journal *j) -+{ -+ struct journal_buf *buf = journal_cur_buf(j); -+ -+ bkey_extent_init(&buf->key); -+ buf->noflush = false; -+ buf->must_flush = false; -+ buf->separate_flush = false; -+ -+ memset(buf->data, 0, sizeof(*buf->data)); -+ buf->data->seq = cpu_to_le64(journal_cur_seq(j)); -+ buf->data->u64s = 0; -+} -+ -+void bch2_journal_halt(struct journal *j) -+{ -+ union journal_res_state old, new; -+ u64 v = atomic64_read(&j->reservations.counter); -+ -+ do { -+ old.v = new.v = v; -+ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) -+ return; -+ -+ new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL; -+ } while ((v = atomic64_cmpxchg(&j->reservations.counter, -+ old.v, new.v)) != old.v); -+ -+ /* -+ * XXX: we're not using j->lock here because this can be called from -+ * interrupt context, this can race with journal_write_done() -+ */ -+ if (!j->err_seq) -+ j->err_seq = journal_cur_seq(j); -+ journal_wake(j); -+ closure_wake_up(&journal_cur_buf(j)->wait); -+} -+ +/* journal entry close/open: */ + +void __bch2_journal_buf_put(struct journal *j) @@ -51845,7 +53825,7 @@ index 000000000000..14bea8a2535e + * We don't close a journal_buf until the next 
journal_buf is finished writing, + * and can be opened again - this also initializes the next journal_buf: + */ -+static bool __journal_entry_close(struct journal *j) ++static void __journal_entry_close(struct journal *j, unsigned closed_val) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_buf *buf = journal_cur_buf(j); @@ -51853,34 +53833,24 @@ index 000000000000..14bea8a2535e + u64 v = atomic64_read(&j->reservations.counter); + unsigned sectors; + ++ BUG_ON(closed_val != JOURNAL_ENTRY_CLOSED_VAL && ++ closed_val != JOURNAL_ENTRY_ERROR_VAL); ++ + lockdep_assert_held(&j->lock); + + do { + old.v = new.v = v; -+ if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL) -+ return true; ++ new.cur_entry_offset = closed_val; + -+ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) { -+ /* this entry will never be written: */ -+ closure_wake_up(&buf->wait); -+ return true; -+ } -+ -+ if (!test_bit(JOURNAL_NEED_WRITE, &j->flags)) { -+ set_bit(JOURNAL_NEED_WRITE, &j->flags); -+ j->need_write_time = local_clock(); -+ } -+ -+ new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL; -+ new.idx++; -+ -+ if (new.idx == new.unwritten_idx) -+ return false; -+ -+ BUG_ON(journal_state_count(new, new.idx)); ++ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL || ++ old.cur_entry_offset == new.cur_entry_offset) ++ return; + } while ((v = atomic64_cmpxchg(&j->reservations.counter, + old.v, new.v)) != old.v); + ++ if (!__journal_entry_is_open(old)) ++ return; ++ + /* Close out old buffer: */ + buf->data->u64s = cpu_to_le32(old.cur_entry_offset); + @@ -51910,36 +53880,42 @@ index 000000000000..14bea8a2535e + */ + buf->last_seq = journal_last_seq(j); + buf->data->last_seq = cpu_to_le64(buf->last_seq); ++ BUG_ON(buf->last_seq > le64_to_cpu(buf->data->seq)); + + __bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq)); + -+ /* Initialize new buffer: */ -+ journal_pin_new_entry(j); -+ -+ bch2_journal_buf_init(j); -+ + cancel_delayed_work(&j->write_work); -+ 
clear_bit(JOURNAL_NEED_WRITE, &j->flags); + + bch2_journal_space_available(j); + + bch2_journal_buf_put(j, old.idx); -+ return true; ++} ++ ++void bch2_journal_halt(struct journal *j) ++{ ++ spin_lock(&j->lock); ++ __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL); ++ if (!j->err_seq) ++ j->err_seq = journal_cur_seq(j); ++ spin_unlock(&j->lock); +} + +static bool journal_entry_want_write(struct journal *j) +{ -+ union journal_res_state s = READ_ONCE(j->reservations); -+ bool ret = false; ++ bool ret = !journal_entry_is_open(j) || ++ journal_cur_seq(j) == journal_last_unwritten_seq(j); + -+ /* -+ * Don't close it yet if we already have a write in flight, but do set -+ * NEED_WRITE: -+ */ -+ if (s.idx != s.unwritten_idx) -+ set_bit(JOURNAL_NEED_WRITE, &j->flags); -+ else -+ ret = __journal_entry_close(j); ++ /* Don't close it yet if we already have a write in flight: */ ++ if (ret) ++ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); ++ else if (nr_unwritten_journal_entries(j)) { ++ struct journal_buf *buf = journal_cur_buf(j); ++ ++ if (!buf->flush_time) { ++ buf->flush_time = local_clock() ?: 1; ++ buf->expires = jiffies; ++ } ++ } + + return ret; +} @@ -51968,34 +53944,71 @@ index 000000000000..14bea8a2535e +static int journal_entry_open(struct journal *j) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct journal_buf *buf = journal_cur_buf(j); ++ struct journal_buf *buf = j->buf + ++ ((journal_cur_seq(j) + 1) & JOURNAL_BUF_MASK); + union journal_res_state old, new; + int u64s; + u64 v; + -+ BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); -+ + lockdep_assert_held(&j->lock); + BUG_ON(journal_entry_is_open(j)); ++ BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); + + if (j->blocked) -+ return cur_entry_blocked; ++ return JOURNAL_ERR_blocked; + + if (j->cur_entry_error) + return j->cur_entry_error; + ++ if (bch2_journal_error(j)) ++ return JOURNAL_ERR_insufficient_devices; /* -EROFS */ ++ ++ if (!fifo_free(&j->pin)) ++ return JOURNAL_ERR_journal_pin_full; ++ ++ if 
(nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) - 1) ++ return JOURNAL_ERR_max_in_flight; ++ + BUG_ON(!j->cur_entry_sectors); + ++ buf->expires = ++ (journal_cur_seq(j) == j->flushed_seq_ondisk ++ ? jiffies ++ : j->last_flush_write) + ++ msecs_to_jiffies(c->opts.journal_flush_delay); ++ + buf->u64s_reserved = j->entry_u64s_reserved; + buf->disk_sectors = j->cur_entry_sectors; + buf->sectors = min(buf->disk_sectors, buf->buf_size >> 9); + + u64s = (int) (buf->sectors << 9) / sizeof(u64) - + journal_entry_overhead(j); -+ u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); ++ u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); + -+ if (u64s <= le32_to_cpu(buf->data->u64s)) -+ return cur_entry_journal_full; ++ if (u64s <= 0) ++ return JOURNAL_ERR_journal_full; ++ ++ if (fifo_empty(&j->pin) && j->reclaim_thread) ++ wake_up_process(j->reclaim_thread); ++ ++ /* ++ * The fifo_push() needs to happen at the same time as j->seq is ++ * incremented for journal_last_seq() to be calculated correctly ++ */ ++ atomic64_inc(&j->seq); ++ journal_pin_list_init(fifo_push_ref(&j->pin), 1); ++ ++ BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf); ++ ++ bkey_extent_init(&buf->key); ++ buf->noflush = false; ++ buf->must_flush = false; ++ buf->separate_flush = false; ++ buf->flush_time = 0; ++ ++ memset(buf->data, 0, sizeof(*buf->data)); ++ buf->data->seq = cpu_to_le64(journal_cur_seq(j)); ++ buf->data->u64s = 0; + + /* + * Must be set before marking the journal entry as open: @@ -52006,14 +54019,14 @@ index 000000000000..14bea8a2535e + do { + old.v = new.v = v; + -+ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) -+ return cur_entry_insufficient_devices; ++ BUG_ON(old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL); + -+ /* Handle any already added entries */ -+ new.cur_entry_offset = le32_to_cpu(buf->data->u64s); ++ new.idx++; ++ BUG_ON(journal_state_count(new, new.idx)); ++ BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_BUF_MASK)); + -+ 
EBUG_ON(journal_state_count(new, new.idx)); + journal_state_inc(&new); ++ new.cur_entry_offset = 0; + } while ((v = atomic64_cmpxchg(&j->reservations.counter, + old.v, new.v)) != old.v); + @@ -52024,15 +54037,14 @@ index 000000000000..14bea8a2535e + + mod_delayed_work(c->io_complete_wq, + &j->write_work, -+ msecs_to_jiffies(j->write_delay_ms)); ++ msecs_to_jiffies(c->opts.journal_flush_delay)); + journal_wake(j); + return 0; +} + +static bool journal_quiesced(struct journal *j) +{ -+ union journal_res_state s = READ_ONCE(j->reservations); -+ bool ret = s.idx == s.unwritten_idx && !__journal_entry_is_open(s); ++ bool ret = atomic64_read(&j->seq) == j->seq_ondisk; + + if (!ret) + journal_entry_close(j); @@ -52047,8 +54059,21 @@ index 000000000000..14bea8a2535e +static void journal_write_work(struct work_struct *work) +{ + struct journal *j = container_of(work, struct journal, write_work.work); ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ long delta; + -+ journal_entry_close(j); ++ spin_lock(&j->lock); ++ if (!__journal_entry_is_open(j->reservations)) ++ goto unlock; ++ ++ delta = journal_cur_buf(j)->expires - jiffies; ++ ++ if (delta > 0) ++ mod_delayed_work(c->io_complete_wq, &j->write_work, delta); ++ else ++ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); ++unlock: ++ spin_unlock(&j->lock); +} + +static int __journal_res_get(struct journal *j, struct journal_res *res, @@ -52077,13 +54102,12 @@ index 000000000000..14bea8a2535e + return 0; + } + -+ if (!(flags & JOURNAL_RES_GET_RESERVED) && -+ !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { ++ if ((flags & JOURNAL_WATERMARK_MASK) < j->watermark) { + /* + * Don't want to close current journal entry, just need to + * invoke reclaim: + */ -+ ret = cur_entry_journal_full; ++ ret = JOURNAL_ERR_journal_full; + goto unlock; + } + @@ -52098,20 +54122,13 @@ index 000000000000..14bea8a2535e + buf->buf_size < JOURNAL_ENTRY_SIZE_MAX) + j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1); + 
-+ if (journal_entry_is_open(j) && -+ !__journal_entry_close(j)) { -+ /* -+ * We failed to get a reservation on the current open journal -+ * entry because it's full, and we can't close it because -+ * there's still a previous one in flight: -+ */ ++ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); ++ ret = journal_entry_open(j); ++ ++ if (ret == JOURNAL_ERR_max_in_flight) + trace_journal_entry_full(c); -+ ret = cur_entry_blocked; -+ } else { -+ ret = journal_entry_open(j); -+ } +unlock: -+ if ((ret && ret != cur_entry_insufficient_devices) && ++ if ((ret && ret != JOURNAL_ERR_insufficient_devices) && + !j->res_get_blocked_start) { + j->res_get_blocked_start = local_clock() ?: 1; + trace_journal_full(c); @@ -52123,23 +54140,24 @@ index 000000000000..14bea8a2535e + if (!ret) + goto retry; + -+ if ((ret == cur_entry_journal_full || -+ ret == cur_entry_journal_pin_full) && ++ if ((ret == JOURNAL_ERR_journal_full || ++ ret == JOURNAL_ERR_journal_pin_full) && + !can_discard && -+ j->reservations.idx == j->reservations.unwritten_idx && -+ (flags & JOURNAL_RES_GET_RESERVED)) { -+ char *journal_debug_buf = kmalloc(4096, GFP_ATOMIC); ++ !nr_unwritten_journal_entries(j) && ++ (flags & JOURNAL_WATERMARK_MASK) == JOURNAL_WATERMARK_reserved) { ++ struct printbuf buf = PRINTBUF; + -+ bch_err(c, "Journal stuck!"); -+ if (journal_debug_buf) { -+ bch2_journal_debug_to_text(&_PBUF(journal_debug_buf, 4096), j); -+ bch_err(c, "%s", journal_debug_buf); ++ bch_err(c, "Journal stuck! 
Hava a pre-reservation but journal full (ret %s)", ++ bch2_journal_errors[ret]); + -+ bch2_journal_pins_to_text(&_PBUF(journal_debug_buf, 4096), j); -+ bch_err(c, "Journal pins:\n%s", journal_debug_buf); -+ kfree(journal_debug_buf); -+ } ++ bch2_journal_debug_to_text(&buf, j); ++ bch_err(c, "%s", buf.buf); + ++ printbuf_reset(&buf); ++ bch2_journal_pins_to_text(&buf, j); ++ bch_err(c, "Journal pins:\n%s", buf.buf); ++ ++ printbuf_exit(&buf); + bch2_fatal_error(c); + dump_stack(); + } @@ -52148,8 +54166,8 @@ index 000000000000..14bea8a2535e + * Journal is full - can't rely on reclaim from work item due to + * freezing: + */ -+ if ((ret == cur_entry_journal_full || -+ ret == cur_entry_journal_pin_full) && ++ if ((ret == JOURNAL_ERR_journal_full || ++ ret == JOURNAL_ERR_journal_pin_full) && + !(flags & JOURNAL_RES_GET_NONBLOCK)) { + if (can_discard) { + bch2_journal_do_discards(j); @@ -52162,7 +54180,7 @@ index 000000000000..14bea8a2535e + } + } + -+ return ret == cur_entry_insufficient_devices ? -EROFS : -EAGAIN; ++ return ret == JOURNAL_ERR_insufficient_devices ? 
-EROFS : -EAGAIN; +} + +/* @@ -52241,7 +54259,7 @@ index 000000000000..14bea8a2535e + /* + * Not enough room in current journal entry, have to flush it: + */ -+ __journal_entry_close(j); ++ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + } else { + journal_cur_buf(j)->u64s_reserved += d; + } @@ -52286,12 +54304,15 @@ index 000000000000..14bea8a2535e + } + + /* if seq was written, but not flushed - flush a newer one instead */ -+ seq = max(seq, last_unwritten_seq(j)); ++ seq = max(seq, journal_last_unwritten_seq(j)); + +recheck_need_open: -+ if (seq == journal_cur_seq(j) && !journal_entry_is_open(j)) { ++ if (seq > journal_cur_seq(j)) { + struct journal_res res = { 0 }; + ++ if (journal_entry_is_open(j)) ++ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); ++ + spin_unlock(&j->lock); + + ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); @@ -52301,7 +54322,11 @@ index 000000000000..14bea8a2535e + seq = res.seq; + buf = j->buf + (seq & JOURNAL_BUF_MASK); + buf->must_flush = true; -+ set_bit(JOURNAL_NEED_WRITE, &j->flags); ++ ++ if (!buf->flush_time) { ++ buf->flush_time = local_clock() ?: 1; ++ buf->expires = jiffies; ++ } + + if (parent && !closure_wait(&buf->wait, parent)) + BUG(); @@ -52339,6 +54364,12 @@ index 000000000000..14bea8a2535e + u64 start_time = local_clock(); + int ret, ret2; + ++ /* ++ * Don't update time_stats when @seq is already flushed: ++ */ ++ if (seq <= j->flushed_seq_ondisk) ++ return 0; ++ + ret = wait_event_interruptible(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL))); + + if (!ret) @@ -52347,8 +54378,61 @@ index 000000000000..14bea8a2535e + return ret ?: ret2 < 0 ? 
ret2 : 0; +} + ++/* ++ * bch2_journal_flush_async - if there is an open journal entry, or a journal ++ * still being written, write it and wait for the write to complete ++ */ ++void bch2_journal_flush_async(struct journal *j, struct closure *parent) ++{ ++ bch2_journal_flush_seq_async(j, atomic64_read(&j->seq), parent); ++} ++ ++int bch2_journal_flush(struct journal *j) ++{ ++ return bch2_journal_flush_seq(j, atomic64_read(&j->seq)); ++} ++ ++/* ++ * bch2_journal_noflush_seq - tell the journal not to issue any flushes before ++ * @seq ++ */ ++bool bch2_journal_noflush_seq(struct journal *j, u64 seq) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ u64 unwritten_seq; ++ bool ret = false; ++ ++ if (!(c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush))) ++ return false; ++ ++ if (seq <= c->journal.flushed_seq_ondisk) ++ return false; ++ ++ spin_lock(&j->lock); ++ if (seq <= c->journal.flushed_seq_ondisk) ++ goto out; ++ ++ for (unwritten_seq = journal_last_unwritten_seq(j); ++ unwritten_seq < seq; ++ unwritten_seq++) { ++ struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq); ++ ++ /* journal write is already in flight, and was a flush write: */ ++ if (unwritten_seq == journal_last_unwritten_seq(j) && !buf->noflush) ++ goto out; ++ ++ buf->noflush = true; ++ } ++ ++ ret = true; ++out: ++ spin_unlock(&j->lock); ++ return ret; ++} ++ +int bch2_journal_meta(struct journal *j) +{ ++ struct journal_buf *buf; + struct journal_res res; + int ret; + @@ -52358,53 +54442,50 @@ index 000000000000..14bea8a2535e + if (ret) + return ret; + ++ buf = j->buf + (res.seq & JOURNAL_BUF_MASK); ++ buf->must_flush = true; ++ ++ if (!buf->flush_time) { ++ buf->flush_time = local_clock() ?: 1; ++ buf->expires = jiffies; ++ } ++ + bch2_journal_res_put(j, &res); + + return bch2_journal_flush_seq(j, res.seq); +} + -+/* -+ * bch2_journal_flush_async - if there is an open journal entry, or a journal -+ * still being written, write it and wait for the write to 
complete -+ */ -+void bch2_journal_flush_async(struct journal *j, struct closure *parent) ++int bch2_journal_log_msg(struct journal *j, const char *fmt, ...) +{ -+ u64 seq, journal_seq; ++ struct jset_entry_log *entry; ++ struct journal_res res = { 0 }; ++ unsigned msglen, u64s; ++ va_list args; ++ int ret; + -+ spin_lock(&j->lock); -+ journal_seq = journal_cur_seq(j); ++ va_start(args, fmt); ++ msglen = vsnprintf(NULL, 0, fmt, args) + 1; ++ va_end(args); + -+ if (journal_entry_is_open(j)) { -+ seq = journal_seq; -+ } else if (journal_seq) { -+ seq = journal_seq - 1; -+ } else { -+ spin_unlock(&j->lock); -+ return; -+ } -+ spin_unlock(&j->lock); ++ u64s = jset_u64s(DIV_ROUND_UP(msglen, sizeof(u64))); + -+ bch2_journal_flush_seq_async(j, seq, parent); -+} ++ ret = bch2_journal_res_get(j, &res, u64s, 0); ++ if (ret) ++ return ret; + -+int bch2_journal_flush(struct journal *j) -+{ -+ u64 seq, journal_seq; ++ entry = container_of(journal_res_entry(j, &res), ++ struct jset_entry_log, entry);; ++ memset(entry, 0, u64s * sizeof(u64)); ++ entry->entry.type = BCH_JSET_ENTRY_log; ++ entry->entry.u64s = u64s - 1; + -+ spin_lock(&j->lock); -+ journal_seq = journal_cur_seq(j); ++ va_start(args, fmt); ++ vsnprintf(entry->d, INT_MAX, fmt, args); ++ va_end(args); + -+ if (journal_entry_is_open(j)) { -+ seq = journal_seq; -+ } else if (journal_seq) { -+ seq = journal_seq - 1; -+ } else { -+ spin_unlock(&j->lock); -+ return 0; -+ } -+ spin_unlock(&j->lock); ++ bch2_journal_res_put(j, &res); + -+ return bch2_journal_flush_seq(j, seq); ++ return bch2_journal_flush_seq(j, res.seq); +} + +/* block/unlock the journal: */ @@ -52434,28 +54515,53 @@ index 000000000000..14bea8a2535e +{ + struct bch_fs *c = ca->fs; + struct journal_device *ja = &ca->journal; -+ struct bch_sb_field_journal *journal_buckets; + u64 *new_bucket_seq = NULL, *new_buckets = NULL; ++ struct open_bucket **ob = NULL; ++ long *bu = NULL; ++ unsigned i, nr_got = 0, nr_want = nr - ja->nr; ++ unsigned old_nr = ja->nr; ++ 
unsigned old_discard_idx = ja->discard_idx; ++ unsigned old_dirty_idx_ondisk = ja->dirty_idx_ondisk; ++ unsigned old_dirty_idx = ja->dirty_idx; ++ unsigned old_cur_idx = ja->cur_idx; + int ret = 0; + -+ /* don't handle reducing nr of buckets yet: */ -+ if (nr <= ja->nr) -+ return 0; ++ if (c) { ++ bch2_journal_block(&c->journal); ++ bch2_journal_flush_all_pins(&c->journal); ++ } + ++ bu = kzalloc(nr_want * sizeof(*bu), GFP_KERNEL); ++ ob = kzalloc(nr_want * sizeof(*ob), GFP_KERNEL); + new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL); + new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL); -+ if (!new_buckets || !new_bucket_seq) { ++ if (!bu || !ob || !new_buckets || !new_bucket_seq) { + ret = -ENOMEM; -+ goto err; ++ goto err_unblock; + } + -+ journal_buckets = bch2_sb_resize_journal(&ca->disk_sb, -+ nr + sizeof(*journal_buckets) / sizeof(u64)); -+ if (!journal_buckets) { -+ ret = -ENOSPC; -+ goto err; ++ for (nr_got = 0; nr_got < nr_want; nr_got++) { ++ if (new_fs) { ++ bu[nr_got] = bch2_bucket_alloc_new_fs(ca); ++ if (bu[nr_got] < 0) { ++ ret = -ENOSPC; ++ break; ++ } ++ } else { ++ ob[nr_got] = bch2_bucket_alloc(c, ca, RESERVE_none, ++ false, cl); ++ if (IS_ERR(ob[nr_got])) { ++ ret = cl ? 
-EAGAIN : -ENOSPC; ++ break; ++ } ++ ++ bu[nr_got] = ob[nr_got]->bucket; ++ } + } + ++ if (!nr_got) ++ goto err_unblock; ++ + /* + * We may be called from the device add path, before the new device has + * actually been added to the running filesystem: @@ -52468,54 +54574,16 @@ index 000000000000..14bea8a2535e + swap(new_buckets, ja->buckets); + swap(new_bucket_seq, ja->bucket_seq); + -+ if (!new_fs) -+ spin_unlock(&c->journal.lock); ++ for (i = 0; i < nr_got; i++) { ++ unsigned pos = ja->discard_idx ?: ja->nr; ++ long b = bu[i]; + -+ while (ja->nr < nr) { -+ struct open_bucket *ob = NULL; -+ unsigned pos; -+ long b; -+ -+ if (new_fs) { -+ if (c) -+ percpu_down_read(&c->mark_lock); -+ b = bch2_bucket_alloc_new_fs(ca); -+ if (b < 0) { -+ percpu_up_read(&c->mark_lock); -+ ret = -ENOSPC; -+ goto err; -+ } -+ } else { -+ rcu_read_lock(); -+ ob = bch2_bucket_alloc(c, ca, RESERVE_NONE, -+ false, cl); -+ rcu_read_unlock(); -+ if (IS_ERR(ob)) { -+ ret = cl ? -EAGAIN : -ENOSPC; -+ goto err; -+ } -+ -+ b = sector_to_bucket(ca, ob->ptr.offset); -+ } -+ -+ if (c) -+ spin_lock(&c->journal.lock); -+ -+ /* -+ * XXX -+ * For resize at runtime, we should be writing the new -+ * superblock before inserting into the journal array -+ */ -+ -+ pos = ja->nr ? 
(ja->cur_idx + 1) % ja->nr : 0; + __array_insert_item(ja->buckets, ja->nr, pos); + __array_insert_item(ja->bucket_seq, ja->nr, pos); -+ __array_insert_item(journal_buckets->buckets, ja->nr, pos); + ja->nr++; + + ja->buckets[pos] = b; + ja->bucket_seq[pos] = 0; -+ journal_buckets->buckets[pos] = cpu_to_le64(b); + + if (pos <= ja->discard_idx) + ja->discard_idx = (ja->discard_idx + 1) % ja->nr; @@ -52525,36 +54593,56 @@ index 000000000000..14bea8a2535e + ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr; + if (pos <= ja->cur_idx) + ja->cur_idx = (ja->cur_idx + 1) % ja->nr; ++ } + -+ if (c) -+ spin_unlock(&c->journal.lock); ++ ret = bch2_journal_buckets_to_sb(c, ca); ++ if (ret) { ++ /* Revert: */ ++ swap(new_buckets, ja->buckets); ++ swap(new_bucket_seq, ja->bucket_seq); ++ ja->nr = old_nr; ++ ja->discard_idx = old_discard_idx; ++ ja->dirty_idx_ondisk = old_dirty_idx_ondisk; ++ ja->dirty_idx = old_dirty_idx; ++ ja->cur_idx = old_cur_idx; ++ } + -+ if (new_fs) { -+ bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal, -+ ca->mi.bucket_size, -+ gc_phase(GC_PHASE_SB), -+ 0); -+ if (c) -+ percpu_up_read(&c->mark_lock); -+ } else { ++ if (!new_fs) ++ spin_unlock(&c->journal.lock); ++ ++ if (c) ++ bch2_journal_unblock(&c->journal); ++ ++ if (ret) ++ goto err; ++ ++ if (!new_fs) { ++ for (i = 0; i < nr_got; i++) { + ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_trans_mark_metadata_bucket(&trans, ca, -+ b, BCH_DATA_journal, ++ bu[i], BCH_DATA_journal, + ca->mi.bucket_size)); -+ -+ bch2_open_bucket_put(c, ob); -+ -+ if (ret) ++ if (ret) { ++ bch2_fs_inconsistent(c, "error marking new journal buckets: %i", ret); + goto err; ++ } + } + } +err: -+ bch2_sb_resize_journal(&ca->disk_sb, -+ ja->nr + sizeof(*journal_buckets) / sizeof(u64)); ++ if (ob && !new_fs) ++ for (i = 0; i < nr_got; i++) ++ bch2_open_bucket_put(c, ob[i]); ++ + kfree(new_bucket_seq); + kfree(new_buckets); ++ kfree(ob); ++ kfree(bu); + + return ret; ++err_unblock: ++ if (c) ++ 
bch2_journal_unblock(&c->journal); ++ goto err; +} + +/* @@ -52567,11 +54655,15 @@ index 000000000000..14bea8a2535e + struct journal_device *ja = &ca->journal; + struct closure cl; + unsigned current_nr; -+ int ret; ++ int ret = 0; ++ ++ /* don't handle reducing nr of buckets yet: */ ++ if (nr < ja->nr) ++ return 0; + + closure_init_stack(&cl); + -+ do { ++ while (ja->nr != nr && (ret == 0 || ret == -EAGAIN)) { + struct disk_reservation disk_res = { 0, 0 }; + + closure_sync(&cl); @@ -52599,7 +54691,7 @@ index 000000000000..14bea8a2535e + if (ja->nr != current_nr) + bch2_write_super(c); + mutex_unlock(&c->sb_lock); -+ } while (ret == -EAGAIN); ++ } + + return ret; +} @@ -52630,17 +54722,16 @@ index 000000000000..14bea8a2535e + +static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx) +{ -+ union journal_res_state state; + bool ret = false; -+ unsigned i; ++ u64 seq; + + spin_lock(&j->lock); -+ state = READ_ONCE(j->reservations); -+ i = state.idx; ++ for (seq = journal_last_unwritten_seq(j); ++ seq <= journal_cur_seq(j) && !ret; ++ seq++) { ++ struct journal_buf *buf = journal_seq_to_buf(j, seq); + -+ while (i != state.unwritten_idx) { -+ i = (i - 1) & JOURNAL_BUF_MASK; -+ if (bch2_bkey_has_device(bkey_i_to_s_c(&j->buf[i].key), dev_idx)) ++ if (bch2_bkey_has_device(bkey_i_to_s_c(&buf->key), dev_idx)) + ret = true; + } + spin_unlock(&j->lock); @@ -52655,6 +54746,7 @@ index 000000000000..14bea8a2535e + +void bch2_fs_journal_stop(struct journal *j) +{ ++ bch2_journal_reclaim_stop(j); + bch2_journal_flush_all_pins(j); + + wait_event(j->wait, journal_entry_close(j)); @@ -52669,11 +54761,9 @@ index 000000000000..14bea8a2535e + + BUG_ON(!bch2_journal_error(j) && + test_bit(JOURNAL_REPLAY_DONE, &j->flags) && -+ (journal_entry_is_open(j) || -+ j->last_empty_seq + 1 != journal_cur_seq(j))); ++ j->last_empty_seq != journal_cur_seq(j)); + + cancel_delayed_work_sync(&j->write_work); -+ bch2_journal_reclaim_stop(j); +} + +int bch2_fs_journal_start(struct 
journal *j, u64 cur_seq, @@ -52702,10 +54792,15 @@ index 000000000000..14bea8a2535e + j->replay_journal_seq = last_seq; + j->replay_journal_seq_end = cur_seq; + j->last_seq_ondisk = last_seq; ++ j->flushed_seq_ondisk = cur_seq - 1; ++ j->seq_ondisk = cur_seq - 1; + j->pin.front = last_seq; + j->pin.back = cur_seq; + atomic64_set(&j->seq, cur_seq - 1); + ++ if (list_empty(journal_entries)) ++ j->last_empty_seq = cur_seq - 1; ++ + fifo_for_each_entry_ptr(p, &j->pin, seq) + journal_pin_list_init(p, 1); + @@ -52718,6 +54813,9 @@ index 000000000000..14bea8a2535e + if (seq < last_seq) + continue; + ++ if (journal_entry_empty(&i->j)) ++ j->last_empty_seq = le64_to_cpu(i->j.seq); ++ + p = journal_seq_pin(j, seq); + + p->devs.nr = 0; @@ -52725,16 +54823,16 @@ index 000000000000..14bea8a2535e + bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev); + } + ++ if (list_empty(journal_entries)) ++ j->last_empty_seq = cur_seq; ++ + spin_lock(&j->lock); + + set_bit(JOURNAL_STARTED, &j->flags); + j->last_flush_write = jiffies; + -+ journal_pin_new_entry(j); -+ + j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j); -+ -+ bch2_journal_buf_init(j); ++ j->reservations.unwritten_idx++; + + c->last_bucket_seq_cleanup = journal_cur_seq(j); + @@ -52762,9 +54860,20 @@ index 000000000000..14bea8a2535e + struct journal_device *ja = &ca->journal; + struct bch_sb_field_journal *journal_buckets = + bch2_sb_get_journal(sb); ++ struct bch_sb_field_journal_v2 *journal_buckets_v2 = ++ bch2_sb_get_journal_v2(sb); + unsigned i; + -+ ja->nr = bch2_nr_journal_buckets(journal_buckets); ++ ja->nr = 0; ++ ++ if (journal_buckets_v2) { ++ unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); ++ ++ for (i = 0; i < nr; i++) ++ ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr); ++ } else if (journal_buckets) { ++ ja->nr = bch2_nr_journal_buckets(journal_buckets); ++ } + + ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); + if (!ja->bucket_seq) @@ -52779,8 +54888,18 @@ 
index 000000000000..14bea8a2535e + if (!ja->buckets) + return -ENOMEM; + -+ for (i = 0; i < ja->nr; i++) -+ ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]); ++ if (journal_buckets_v2) { ++ unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); ++ unsigned j, dst = 0; ++ ++ for (i = 0; i < nr; i++) ++ for (j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++) ++ ja->buckets[dst++] = ++ le64_to_cpu(journal_buckets_v2->d[i].start) + j; ++ } else if (journal_buckets) { ++ for (i = 0; i < ja->nr; i++) ++ ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]); ++ } + + return 0; +} @@ -52814,9 +54933,6 @@ index 000000000000..14bea8a2535e + + lockdep_init_map(&j->res_map, "journal res", &res_key, 0); + -+ j->write_delay_ms = 1000; -+ j->reclaim_delay_ms = 100; -+ + atomic64_set(&j->reservations.counter, + ((union journal_res_state) + { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); @@ -52848,75 +54964,81 @@ index 000000000000..14bea8a2535e + struct bch_fs *c = container_of(j, struct bch_fs, journal); + union journal_res_state s; + struct bch_dev *ca; ++ unsigned long now = jiffies; ++ u64 seq; + unsigned i; + ++ out->atomic++; ++ out->tabstops[0] = 24; ++ + rcu_read_lock(); + s = READ_ONCE(j->reservations); + -+ pr_buf(out, -+ "active journal entries:\t%llu\n" -+ "seq:\t\t\t%llu\n" -+ "last_seq:\t\t%llu\n" -+ "last_seq_ondisk:\t%llu\n" -+ "flushed_seq_ondisk:\t%llu\n" -+ "prereserved:\t\t%u/%u\n" -+ "each entry reserved:\t%u\n" -+ "nr flush writes:\t%llu\n" -+ "nr noflush writes:\t%llu\n" -+ "nr direct reclaim:\t%llu\n" -+ "nr background reclaim:\t%llu\n" -+ "reclaim kicked:\t\t%u\n" -+ "reclaim runs in:\t%u ms\n" -+ "current entry sectors:\t%u\n" -+ "current entry error:\t%u\n" -+ "current entry:\t\t", -+ fifo_used(&j->pin), -+ journal_cur_seq(j), -+ journal_last_seq(j), -+ j->last_seq_ondisk, -+ j->flushed_seq_ondisk, -+ j->prereserved.reserved, -+ j->prereserved.remaining, -+ j->entry_u64s_reserved, -+ j->nr_flush_writes, -+ 
j->nr_noflush_writes, -+ j->nr_direct_reclaim, -+ j->nr_background_reclaim, -+ j->reclaim_kicked, -+ jiffies_to_msecs(j->next_reclaim - jiffies), -+ j->cur_entry_sectors, -+ j->cur_entry_error); ++ pr_buf(out, "dirty journal entries:\t%llu/%llu\n",fifo_used(&j->pin), j->pin.size); ++ pr_buf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j)); ++ pr_buf(out, "seq_ondisk:\t\t%llu\n", j->seq_ondisk); ++ pr_buf(out, "last_seq:\t\t%llu\n", journal_last_seq(j)); ++ pr_buf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk); ++ pr_buf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); ++ pr_buf(out, "prereserved:\t\t%u/%u\n", j->prereserved.reserved, j->prereserved.remaining); ++ pr_buf(out, "watermark:\t\t%s\n", bch2_journal_watermarks[j->watermark]); ++ pr_buf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved); ++ pr_buf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes); ++ pr_buf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes); ++ pr_buf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim); ++ pr_buf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim); ++ pr_buf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked); ++ pr_buf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now) ++ ? 
jiffies_to_msecs(j->next_reclaim - jiffies) : 0); ++ pr_buf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors); ++ pr_buf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]); ++ pr_buf(out, "current entry:\t\t"); + + switch (s.cur_entry_offset) { + case JOURNAL_ENTRY_ERROR_VAL: -+ pr_buf(out, "error\n"); ++ pr_buf(out, "error"); + break; + case JOURNAL_ENTRY_CLOSED_VAL: -+ pr_buf(out, "closed\n"); ++ pr_buf(out, "closed"); + break; + default: -+ pr_buf(out, "%u/%u\n", -+ s.cur_entry_offset, -+ j->cur_entry_u64s); ++ pr_buf(out, "%u/%u", s.cur_entry_offset, j->cur_entry_u64s); + break; + } + -+ pr_buf(out, -+ "current entry:\t\tidx %u refcount %u\n", -+ s.idx, journal_state_count(s, s.idx)); ++ pr_newline(out); + -+ i = s.idx; -+ while (i != s.unwritten_idx) { -+ i = (i - 1) & JOURNAL_BUF_MASK; ++ for (seq = journal_cur_seq(j); ++ seq >= journal_last_unwritten_seq(j); ++ --seq) { ++ i = seq & JOURNAL_BUF_MASK; + -+ pr_buf(out, "unwritten entry:\tidx %u refcount %u sectors %u\n", -+ i, journal_state_count(s, i), j->buf[i].sectors); ++ pr_buf(out, "unwritten entry:"); ++ pr_tab(out); ++ pr_buf(out, "%llu", seq); ++ pr_newline(out); ++ pr_indent_push(out, 2); ++ ++ pr_buf(out, "refcount:"); ++ pr_tab(out); ++ pr_buf(out, "%u", journal_state_count(s, i)); ++ pr_newline(out); ++ ++ pr_buf(out, "sectors:"); ++ pr_tab(out); ++ pr_buf(out, "%u", j->buf[i].sectors); ++ pr_newline(out); ++ ++ pr_buf(out, "expires"); ++ pr_tab(out); ++ pr_buf(out, "%li jiffies", j->buf[i].expires - jiffies); ++ pr_newline(out); ++ ++ pr_indent_pop(out, 2); + } + + pr_buf(out, -+ "need write:\t\t%i\n" + "replay done:\t\t%i\n", -+ test_bit(JOURNAL_NEED_WRITE, &j->flags), + test_bit(JOURNAL_REPLAY_DONE, &j->flags)); + + pr_buf(out, "space:\n"); @@ -52943,25 +55065,19 @@ index 000000000000..14bea8a2535e + if (!ja->nr) + continue; + -+ pr_buf(out, -+ "dev %u:\n" -+ "\tnr\t\t%u\n" -+ "\tbucket size\t%u\n" -+ "\tavailable\t%u:%u\n" -+ "\tdiscard_idx\t%u\n" -+ 
"\tdirty_ondisk\t%u (seq %llu)\n" -+ "\tdirty_idx\t%u (seq %llu)\n" -+ "\tcur_idx\t\t%u (seq %llu)\n", -+ i, ja->nr, ca->mi.bucket_size, -+ bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), -+ ja->sectors_free, -+ ja->discard_idx, -+ ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk], -+ ja->dirty_idx, ja->bucket_seq[ja->dirty_idx], -+ ja->cur_idx, ja->bucket_seq[ja->cur_idx]); ++ pr_buf(out, "dev %u:\n", i); ++ pr_buf(out, "\tnr\t\t%u\n", ja->nr); ++ pr_buf(out, "\tbucket size\t%u\n", ca->mi.bucket_size); ++ pr_buf(out, "\tavailable\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free); ++ pr_buf(out, "\tdiscard_idx\t%u\n", ja->discard_idx); ++ pr_buf(out, "\tdirty_ondisk\t%u (seq %llu)\n", ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk]); ++ pr_buf(out, "\tdirty_idx\t%u (seq %llu)\n", ja->dirty_idx, ja->bucket_seq[ja->dirty_idx]); ++ pr_buf(out, "\tcur_idx\t\t%u (seq %llu)\n", ja->cur_idx, ja->bucket_seq[ja->cur_idx]); + } + + rcu_read_unlock(); ++ ++ --out->atomic; +} + +void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) @@ -52971,36 +55087,68 @@ index 000000000000..14bea8a2535e + spin_unlock(&j->lock); +} + -+void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) ++bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq) +{ + struct journal_entry_pin_list *pin_list; + struct journal_entry_pin *pin; -+ u64 i; + + spin_lock(&j->lock); -+ fifo_for_each_entry_ptr(pin_list, &j->pin, i) { -+ pr_buf(out, "%llu: count %u\n", -+ i, atomic_read(&pin_list->count)); ++ *seq = max(*seq, j->pin.front); + -+ list_for_each_entry(pin, &pin_list->list, list) -+ pr_buf(out, "\t%px %ps\n", -+ pin, pin->flush); -+ -+ if (!list_empty(&pin_list->flushed)) -+ pr_buf(out, "flushed:\n"); -+ -+ list_for_each_entry(pin, &pin_list->flushed, list) -+ pr_buf(out, "\t%px %ps\n", -+ pin, pin->flush); ++ if (*seq >= j->pin.back) { ++ 
spin_unlock(&j->lock); ++ return true; + } ++ ++ out->atomic++; ++ ++ pin_list = journal_seq_pin(j, *seq); ++ ++ pr_buf(out, "%llu: count %u", *seq, atomic_read(&pin_list->count)); ++ pr_newline(out); ++ pr_indent_push(out, 2); ++ ++ list_for_each_entry(pin, &pin_list->list, list) { ++ pr_buf(out, "\t%px %ps", pin, pin->flush); ++ pr_newline(out); ++ } ++ ++ list_for_each_entry(pin, &pin_list->key_cache_list, list) { ++ pr_buf(out, "\t%px %ps", pin, pin->flush); ++ pr_newline(out); ++ } ++ ++ if (!list_empty(&pin_list->flushed)) { ++ pr_buf(out, "flushed:"); ++ pr_newline(out); ++ } ++ ++ list_for_each_entry(pin, &pin_list->flushed, list) { ++ pr_buf(out, "\t%px %ps", pin, pin->flush); ++ pr_newline(out); ++ } ++ ++ pr_indent_pop(out, 2); ++ ++ --out->atomic; + spin_unlock(&j->lock); ++ ++ return false; ++} ++ ++void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) ++{ ++ u64 seq = 0; ++ ++ while (!bch2_journal_seq_pins_to_text(out, j, &seq)) ++ seq++; +} diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h new file mode 100644 -index 000000000000..c39cbbf1bccd +index 000000000000..e7321c327d9d --- /dev/null +++ b/fs/bcachefs/journal.h -@@ -0,0 +1,519 @@ +@@ -0,0 +1,522 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_JOURNAL_H +#define _BCACHEFS_JOURNAL_H @@ -53144,6 +55292,11 @@ index 000000000000..c39cbbf1bccd + return j->pin.back - 1; +} + ++static inline u64 journal_last_unwritten_seq(struct journal *j) ++{ ++ return j->seq_ondisk + 1; ++} ++ +void bch2_journal_set_has_inum(struct journal *, u64, u64); + +static inline int journal_state_count(union journal_res_state s, int idx) @@ -53264,9 +55417,6 @@ index 000000000000..c39cbbf1bccd + .buf3_count = idx == 3, + }).v, &j->reservations.counter); + -+ EBUG_ON(((s.idx - idx) & 3) > -+ ((s.idx - s.unwritten_idx) & 3)); -+ + if (!journal_state_count(s, idx) && idx == s.unwritten_idx) + __bch2_journal_buf_put(j); +} @@ -53296,9 +55446,9 @@ index 000000000000..c39cbbf1bccd +int 
bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, + unsigned); + -+#define JOURNAL_RES_GET_NONBLOCK (1 << 0) -+#define JOURNAL_RES_GET_CHECK (1 << 1) -+#define JOURNAL_RES_GET_RESERVED (1 << 2) ++/* First two bits for JOURNAL_WATERMARK: */ ++#define JOURNAL_RES_GET_NONBLOCK (1 << 2) ++#define JOURNAL_RES_GET_CHECK (1 << 3) + +static inline int journal_res_get_fast(struct journal *j, + struct journal_res *res, @@ -53319,8 +55469,7 @@ index 000000000000..c39cbbf1bccd + + EBUG_ON(!journal_state_count(new, new.idx)); + -+ if (!(flags & JOURNAL_RES_GET_RESERVED) && -+ !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) ++ if ((flags & JOURNAL_WATERMARK_MASK) < j->watermark) + return 0; + + new.cur_entry_offset += res->u64s; @@ -53373,23 +55522,27 @@ index 000000000000..c39cbbf1bccd + +/* journal_preres: */ + -+static inline bool journal_check_may_get_unreserved(struct journal *j) ++static inline void journal_set_watermark(struct journal *j) +{ + union journal_preres_state s = READ_ONCE(j->prereserved); -+ bool ret = s.reserved < s.remaining && -+ fifo_free(&j->pin) > 8; ++ unsigned watermark = JOURNAL_WATERMARK_any; + -+ lockdep_assert_held(&j->lock); ++ if (fifo_free(&j->pin) < j->pin.size / 4) ++ watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_copygc); ++ if (fifo_free(&j->pin) < j->pin.size / 8) ++ watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_reserved); + -+ if (ret != test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { -+ if (ret) { -+ set_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags); -+ journal_wake(j); -+ } else { -+ clear_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags); -+ } -+ } -+ return ret; ++ if (s.reserved > s.remaining) ++ watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_copygc); ++ if (!s.remaining) ++ watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_reserved); ++ ++ if (watermark == j->watermark) ++ return; ++ ++ swap(watermark, j->watermark); ++ if (watermark > j->watermark) ++ journal_wake(j); +} + +static 
inline void bch2_journal_preres_put(struct journal *j, @@ -53409,12 +55562,8 @@ index 000000000000..c39cbbf1bccd + closure_wake_up(&j->preres_wait); + } + -+ if (s.reserved <= s.remaining && -+ !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { -+ spin_lock(&j->lock); -+ journal_check_may_get_unreserved(j); -+ spin_unlock(&j->lock); -+ } ++ if (s.reserved <= s.remaining && j->watermark) ++ journal_set_watermark(j); +} + +int __bch2_journal_preres_get(struct journal *, @@ -53435,8 +55584,7 @@ index 000000000000..c39cbbf1bccd + old.v = new.v = v; + ret = 0; + -+ if ((flags & JOURNAL_RES_GET_RESERVED) || -+ test_bit(JOURNAL_NOCHANGES, &j->flags) || ++ if ((flags & JOURNAL_WATERMARK_reserved) || + new.reserved + d < new.remaining) { + new.reserved += d; + ret = 1; @@ -53480,7 +55628,9 @@ index 000000000000..c39cbbf1bccd + +int bch2_journal_flush_seq(struct journal *, u64); +int bch2_journal_flush(struct journal *); ++bool bch2_journal_noflush_seq(struct journal *, u64); +int bch2_journal_meta(struct journal *); ++int bch2_journal_log_msg(struct journal *, const char *, ...); + +void bch2_journal_halt(struct journal *); + @@ -53504,6 +55654,7 @@ index 000000000000..c39cbbf1bccd +void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); +void bch2_journal_debug_to_text(struct printbuf *, struct journal *); +void bch2_journal_pins_to_text(struct printbuf *, struct journal *); ++bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *); + +int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, + unsigned nr); @@ -53522,12 +55673,13 @@ index 000000000000..c39cbbf1bccd +#endif /* _BCACHEFS_JOURNAL_H */ diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c new file mode 100644 -index 000000000000..5c8304e05abd +index 000000000000..e61b88930a7f --- /dev/null +++ b/fs/bcachefs/journal_io.c -@@ -0,0 +1,1554 @@ +@@ -0,0 +1,1700 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" ++#include "alloc_background.h" 
+#include "alloc_foreground.h" +#include "btree_io.h" +#include "btree_update_interior.h" @@ -53575,12 +55727,12 @@ index 000000000000..5c8304e05abd + * be replayed: + */ +static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, -+ struct bch_extent_ptr entry_ptr, ++ struct journal_ptr entry_ptr, + struct journal_list *jlist, struct jset *j, + bool bad) +{ + struct journal_replay *i, *pos, *dup = NULL; -+ struct bch_extent_ptr *ptr; ++ struct journal_ptr *ptr; + struct list_head *where; + size_t bytes = vstruct_bytes(j); + u64 last_seq = 0; @@ -53780,14 +55932,15 @@ index 000000000000..5c8304e05abd + invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k), + __btree_node_type(level, btree_id)); + if (invalid) { -+ char buf[160]; ++ struct printbuf buf = PRINTBUF; + -+ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k)); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); + mustfix_fsck_err(c, "invalid %s in %s entry offset %zi/%u: %s\n%s", + type, where, + (u64 *) k - entry->_data, + le16_to_cpu(entry->u64s), -+ invalid, buf); ++ invalid, buf.buf); ++ printbuf_exit(&buf); + + le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); + memmove(k, bkey_next(k), next - (void *) bkey_next(k)); @@ -53802,7 +55955,7 @@ index 000000000000..5c8304e05abd + return ret; +} + -+static int journal_entry_validate_btree_keys(struct bch_fs *c, ++static int journal_entry_btree_keys_validate(struct bch_fs *c, + const char *where, + struct jset_entry *entry, + unsigned version, int big_endian, int write) @@ -53823,7 +55976,24 @@ index 000000000000..5c8304e05abd + return 0; +} + -+static int journal_entry_validate_btree_root(struct bch_fs *c, ++static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ struct bkey_i *k; ++ bool first = true; ++ ++ vstruct_for_each(entry, k) { ++ if (!first) { ++ pr_newline(out); ++ pr_buf(out, "%s: ", bch2_jset_entry_types[entry->type]); ++ } ++ pr_buf(out, "btree=%s l=%u ", 
bch2_btree_ids[entry->btree_id], entry->level); ++ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); ++ first = false; ++ } ++} ++ ++static int journal_entry_btree_root_validate(struct bch_fs *c, + const char *where, + struct jset_entry *entry, + unsigned version, int big_endian, int write) @@ -53851,7 +56021,13 @@ index 000000000000..5c8304e05abd + return ret; +} + -+static int journal_entry_validate_prio_ptrs(struct bch_fs *c, ++static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ journal_entry_btree_keys_to_text(out, c, entry); ++} ++ ++static int journal_entry_prio_ptrs_validate(struct bch_fs *c, + const char *where, + struct jset_entry *entry, + unsigned version, int big_endian, int write) @@ -53860,7 +56036,12 @@ index 000000000000..5c8304e05abd + return 0; +} + -+static int journal_entry_validate_blacklist(struct bch_fs *c, ++static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++} ++ ++static int journal_entry_blacklist_validate(struct bch_fs *c, + const char *where, + struct jset_entry *entry, + unsigned version, int big_endian, int write) @@ -53875,7 +56056,16 @@ index 000000000000..5c8304e05abd + return ret; +} + -+static int journal_entry_validate_blacklist_v2(struct bch_fs *c, ++static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ struct jset_entry_blacklist *bl = ++ container_of(entry, struct jset_entry_blacklist, entry); ++ ++ pr_buf(out, "seq=%llu", le64_to_cpu(bl->seq)); ++} ++ ++static int journal_entry_blacklist_v2_validate(struct bch_fs *c, + const char *where, + struct jset_entry *entry, + unsigned version, int big_endian, int write) @@ -53901,7 +56091,18 @@ index 000000000000..5c8304e05abd + return ret; +} + -+static int journal_entry_validate_usage(struct bch_fs *c, ++static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct 
bch_fs *c, ++ struct jset_entry *entry) ++{ ++ struct jset_entry_blacklist_v2 *bl = ++ container_of(entry, struct jset_entry_blacklist_v2, entry); ++ ++ pr_buf(out, "start=%llu end=%llu", ++ le64_to_cpu(bl->start), ++ le64_to_cpu(bl->end)); ++} ++ ++static int journal_entry_usage_validate(struct bch_fs *c, + const char *where, + struct jset_entry *entry, + unsigned version, int big_endian, int write) @@ -53922,7 +56123,18 @@ index 000000000000..5c8304e05abd + return ret; +} + -+static int journal_entry_validate_data_usage(struct bch_fs *c, ++static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ struct jset_entry_usage *u = ++ container_of(entry, struct jset_entry_usage, entry); ++ ++ pr_buf(out, "type=%s v=%llu", ++ bch2_fs_usage_types[u->entry.btree_id], ++ le64_to_cpu(u->v)); ++} ++ ++static int journal_entry_data_usage_validate(struct bch_fs *c, + const char *where, + struct jset_entry *entry, + unsigned version, int big_endian, int write) @@ -53944,7 +56156,17 @@ index 000000000000..5c8304e05abd + return ret; +} + -+static int journal_entry_validate_clock(struct bch_fs *c, ++static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ struct jset_entry_data_usage *u = ++ container_of(entry, struct jset_entry_data_usage, entry); ++ ++ bch2_replicas_entry_to_text(out, &u->r); ++ pr_buf(out, "=%llu", le64_to_cpu(u->v)); ++} ++ ++static int journal_entry_clock_validate(struct bch_fs *c, + const char *where, + struct jset_entry *entry, + unsigned version, int big_endian, int write) @@ -53970,7 +56192,16 @@ index 000000000000..5c8304e05abd + return ret; +} + -+static int journal_entry_validate_dev_usage(struct bch_fs *c, ++static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ struct jset_entry_clock *clock = ++ container_of(entry, struct jset_entry_clock, entry); ++ ++ pr_buf(out, 
"%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time)); ++} ++ ++static int journal_entry_dev_usage_validate(struct bch_fs *c, + const char *where, + struct jset_entry *entry, + unsigned version, int big_endian, int write) @@ -54007,15 +56238,59 @@ index 000000000000..5c8304e05abd + return ret; +} + ++static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ struct jset_entry_dev_usage *u = ++ container_of(entry, struct jset_entry_dev_usage, entry); ++ unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); ++ ++ pr_buf(out, "dev=%u", le32_to_cpu(u->dev)); ++ ++ for (i = 0; i < nr_types; i++) { ++ if (i < BCH_DATA_NR) ++ pr_buf(out, " %s", bch2_data_types[i]); ++ else ++ pr_buf(out, " (unknown data type %u)", i); ++ pr_buf(out, ": buckets=%llu sectors=%llu fragmented=%llu", ++ le64_to_cpu(u->d[i].buckets), ++ le64_to_cpu(u->d[i].sectors), ++ le64_to_cpu(u->d[i].fragmented)); ++ } ++ ++ pr_buf(out, " buckets_ec: %llu buckets_unavailable: %llu", ++ le64_to_cpu(u->buckets_ec), ++ le64_to_cpu(u->buckets_unavailable)); ++} ++ ++static int journal_entry_log_validate(struct bch_fs *c, ++ const char *where, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, int write) ++{ ++ return 0; ++} ++ ++static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); ++ unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d); ++ ++ pr_buf(out, "%.*s", bytes, l->d); ++} ++ +struct jset_entry_ops { + int (*validate)(struct bch_fs *, const char *, + struct jset_entry *, unsigned, int, int); ++ void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); +}; + +static const struct jset_entry_ops bch2_jset_entry_ops[] = { +#define x(f, nr) \ + [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \ -+ .validate = journal_entry_validate_##f, \ ++ 
.validate = journal_entry_##f##_validate, \ ++ .to_text = journal_entry_##f##_to_text, \ + }, + BCH_JSET_ENTRY_TYPES() +#undef x @@ -54031,6 +56306,17 @@ index 000000000000..5c8304e05abd + : 0; +} + ++void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ if (entry->type < BCH_JSET_ENTRY_NR) { ++ pr_buf(out, "%s: ", bch2_jset_entry_types[entry->type]); ++ bch2_jset_entry_ops[entry->type].to_text(out, c, entry); ++ } else { ++ pr_buf(out, "(unknown type %u)", entry->type); ++ } ++} ++ +static int jset_validate_entries(struct bch_fs *c, struct jset *jset, + int write) +{ @@ -54120,9 +56406,11 @@ index 000000000000..5c8304e05abd + sector, le64_to_cpu(jset->seq))) + ret = JOURNAL_ENTRY_BAD; + -+ bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), ++ ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), + jset->encrypted_start, + vstruct_end(jset) - (void *) jset->encrypted_start); ++ bch2_fs_fatal_err_on(ret, c, ++ "error decrypting journal entry: %i", ret); +csum_done: + /* last_seq is ignored when JSET_NO_FLUSH is true */ + if (journal_entry_err_on(!JSET_NO_FLUSH(jset) && @@ -54238,7 +56526,7 @@ index 000000000000..5c8304e05abd + case JOURNAL_ENTRY_NONE: + if (!saw_bad) + return 0; -+ sectors = c->opts.block_size; ++ sectors = block_sectors(c); + goto next_block; + case JOURNAL_ENTRY_BAD: + saw_bad = true; @@ -54247,7 +56535,7 @@ index 000000000000..5c8304e05abd + * field of the journal entry we read, so try reading + * again at next block boundary: + */ -+ sectors = c->opts.block_size; ++ sectors = block_sectors(c); + break; + default: + return ret; @@ -54265,9 +56553,12 @@ index 000000000000..5c8304e05abd + ja->bucket_seq[bucket] = le64_to_cpu(j->seq); + + mutex_lock(&jlist->lock); -+ ret = journal_entry_add(c, ca, (struct bch_extent_ptr) { -+ .dev = ca->dev_idx, -+ .offset = offset, ++ ret = journal_entry_add(c, ca, (struct journal_ptr) { ++ .dev = ca->dev_idx, ++ .bucket = bucket, ++ 
.bucket_offset = offset - ++ bucket_to_sector(ca, ja->buckets[bucket]), ++ .sector = offset, + }, jlist, j, ret != 0); + mutex_unlock(&jlist->lock); + @@ -54294,12 +56585,14 @@ index 000000000000..5c8304e05abd + struct journal_device *ja = + container_of(cl, struct journal_device, read); + struct bch_dev *ca = container_of(ja, struct bch_dev, journal); ++ struct bch_fs *c = ca->fs; + struct journal_list *jlist = + container_of(cl->parent, struct journal_list, cl); ++ struct journal_replay *r; + struct journal_read_buf buf = { NULL, 0 }; + u64 min_seq = U64_MAX; + unsigned i; -+ int ret; ++ int ret = 0; + + if (!ja->nr) + goto out; @@ -54331,11 +56624,37 @@ index 000000000000..5c8304e05abd + * allocate + */ + while (ja->bucket_seq[ja->cur_idx] > min_seq && -+ ja->bucket_seq[ja->cur_idx] > ++ ja->bucket_seq[ja->cur_idx] == + ja->bucket_seq[(ja->cur_idx + 1) % ja->nr]) + ja->cur_idx = (ja->cur_idx + 1) % ja->nr; + -+ ja->sectors_free = 0; ++ ja->sectors_free = ca->mi.bucket_size; ++ ++ mutex_lock(&jlist->lock); ++ list_for_each_entry(r, jlist->head, list) { ++ for (i = 0; i < r->nr_ptrs; i++) { ++ if (r->ptrs[i].dev == ca->dev_idx && ++ sector_to_bucket(ca, r->ptrs[i].sector) == ja->buckets[ja->cur_idx]) { ++ unsigned wrote = (r->ptrs[i].sector % ca->mi.bucket_size) + ++ vstruct_sectors(&r->j, c->block_bits); ++ ++ ja->sectors_free = min(ja->sectors_free, ++ ca->mi.bucket_size - wrote); ++ } ++ } ++ } ++ mutex_unlock(&jlist->lock); ++ ++ if (ja->bucket_seq[ja->cur_idx] && ++ ja->sectors_free == ca->mi.bucket_size) { ++ bch_err(c, "ja->sectors_free == ca->mi.bucket_size"); ++ bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr); ++ for (i = 0; i < 3; i++) { ++ unsigned idx = ja->cur_idx - 1 + i; ++ bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]); ++ } ++ ja->sectors_free = 0; ++ } + + /* + * Set dirty_idx to indicate the entire journal is full and needs to be @@ -54345,6 +56664,7 @@ index 000000000000..5c8304e05abd + ja->discard_idx = ja->dirty_idx_ondisk = + 
ja->dirty_idx = (ja->cur_idx + 1) % ja->nr; +out: ++ bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret); + kvpfree(buf.data, buf.size); + percpu_ref_put(&ca->io_ref); + closure_return(cl); @@ -54356,8 +56676,8 @@ index 000000000000..5c8304e05abd + goto out; +} + -+static void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, -+ struct journal_replay *j) ++void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, ++ struct journal_replay *j) +{ + unsigned i; + @@ -54365,13 +56685,15 @@ index 000000000000..5c8304e05abd + struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev); + u64 offset; + -+ div64_u64_rem(j->ptrs[i].offset, ca->mi.bucket_size, &offset); ++ div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset); + + if (i) + pr_buf(out, " "); -+ pr_buf(out, "%u:%llu (offset %llu)", ++ pr_buf(out, "%u:%u:%u (sector %llu)", + j->ptrs[i].dev, -+ (u64) j->ptrs[i].offset, offset); ++ j->ptrs[i].bucket, ++ j->ptrs[i].bucket_offset, ++ j->ptrs[i].sector); + } +} + @@ -54382,6 +56704,7 @@ index 000000000000..5c8304e05abd + struct journal_replay *i, *t; + struct bch_dev *ca; + unsigned iter; ++ struct printbuf buf = PRINTBUF; + size_t keys = 0, entries = 0; + bool degraded = false; + u64 seq, last_seq = 0; @@ -54440,7 +56763,8 @@ index 000000000000..5c8304e05abd + + if (!last_seq) { + fsck_err(c, "journal read done, but no entries found after dropping non-flushes"); -+ return -1; ++ ret = -1; ++ goto err; + } + + /* Drop blacklisted entries and entries older than last_seq: */ @@ -54472,7 +56796,7 @@ index 000000000000..5c8304e05abd + + while (seq < le64_to_cpu(i->j.seq)) { + u64 missing_start, missing_end; -+ char buf1[200], buf2[200]; ++ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + + while (seq < le64_to_cpu(i->j.seq) && + bch2_journal_seq_is_blacklisted(c, seq, false)) @@ -54488,14 +56812,13 @@ index 000000000000..5c8304e05abd + seq++; + + if (i->list.prev != list) { -+ struct printbuf out = PBUF(buf1); 
+ struct journal_replay *p = list_prev_entry(i, list); + -+ bch2_journal_ptrs_to_text(&out, c, p); -+ pr_buf(&out, " size %llu", vstruct_sectors(&p->j, c->block_bits)); ++ bch2_journal_ptrs_to_text(&buf1, c, p); ++ pr_buf(&buf1, " size %zu", vstruct_sectors(&p->j, c->block_bits)); + } else -+ sprintf(buf1, "(none)"); -+ bch2_journal_ptrs_to_text(&PBUF(buf2), c, i); ++ pr_buf(&buf1, "(none)"); ++ bch2_journal_ptrs_to_text(&buf2, c, i); + + missing_end = seq - 1; + fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)\n" @@ -54503,7 +56826,10 @@ index 000000000000..5c8304e05abd + " next at %s", + missing_start, missing_end, + last_seq, *blacklist_seq - 1, -+ buf1, buf2); ++ buf1.buf, buf2.buf); ++ ++ printbuf_exit(&buf1); ++ printbuf_exit(&buf2); + } + + seq++; @@ -54517,14 +56843,13 @@ index 000000000000..5c8304e05abd + .e.nr_required = 1, + }; + unsigned ptr; -+ char buf[80]; + + if (i->ignore) + continue; + + ret = jset_validate_entries(c, &i->j, READ); + if (ret) -+ goto fsck_err; ++ goto err; + + for (ptr = 0; ptr < i->nr_ptrs; ptr++) + replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev; @@ -54536,15 +56861,17 @@ index 000000000000..5c8304e05abd + * the devices - this is wrong: + */ + ++ printbuf_reset(&buf); ++ bch2_replicas_entry_to_text(&buf, &replicas.e); ++ + if (!degraded && + (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || + fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c, + "superblock not marked as containing replicas %s", -+ (bch2_replicas_entry_to_text(&PBUF(buf), -+ &replicas.e), buf)))) { ++ buf.buf))) { + ret = bch2_mark_replicas(c, &replicas.e); + if (ret) -+ return ret; ++ goto err; + } + + for_each_jset_key(k, _n, entry, &i->j) @@ -54558,7 +56885,9 @@ index 000000000000..5c8304e05abd + if (*start_seq != *blacklist_seq) + bch_info(c, "dropped unflushed entries %llu-%llu", + *blacklist_seq, *start_seq - 1); ++err: +fsck_err: ++ printbuf_exit(&buf); + return ret; +} + @@ -54685,49 +57014,6 @@ index 
000000000000..5c8304e05abd + return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS; +} + -+static void journal_write_compact(struct jset *jset) -+{ -+ struct jset_entry *i, *next, *prev = NULL; -+ -+ /* -+ * Simple compaction, dropping empty jset_entries (from journal -+ * reservations that weren't fully used) and merging jset_entries that -+ * can be. -+ * -+ * If we wanted to be really fancy here, we could sort all the keys in -+ * the jset and drop keys that were overwritten - probably not worth it: -+ */ -+ vstruct_for_each_safe(jset, i, next) { -+ unsigned u64s = le16_to_cpu(i->u64s); -+ -+ /* Empty entry: */ -+ if (!u64s) -+ continue; -+ -+ /* Can we merge with previous entry? */ -+ if (prev && -+ i->btree_id == prev->btree_id && -+ i->level == prev->level && -+ i->type == prev->type && -+ i->type == BCH_JSET_ENTRY_btree_keys && -+ le16_to_cpu(prev->u64s) + u64s <= U16_MAX) { -+ memmove_u64s_down(vstruct_next(prev), -+ i->_data, -+ u64s); -+ le16_add_cpu(&prev->u64s, u64s); -+ continue; -+ } -+ -+ /* Couldn't merge, move i into new position (after prev): */ -+ prev = prev ? vstruct_next(prev) : jset->start; -+ if (i != prev) -+ memmove_u64s_down(prev, i, jset_u64s(u64s)); -+ } -+ -+ prev = prev ? vstruct_next(prev) : jset->start; -+ jset->u64s = cpu_to_le32((u64 *) prev - jset->_data); -+} -+ +static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) +{ + /* we aren't holding j->lock: */ @@ -54753,7 +57039,7 @@ index 000000000000..5c8304e05abd + +static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) +{ -+ return j->buf + j->reservations.unwritten_idx; ++ return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK); +} + +static void journal_write_done(struct closure *cl) @@ -54766,7 +57052,9 @@ index 000000000000..5c8304e05abd + u64 v, seq; + int err = 0; + -+ bch2_time_stats_update(j->write_time, j->write_start_time); ++ bch2_time_stats_update(!JSET_NO_FLUSH(w->data) ++ ? 
j->flush_write_time ++ : j->noflush_write_time, j->write_start_time); + + if (!w->devs_written.nr) { + bch_err(c, "unable to write journal to sufficient devices"); @@ -54788,15 +57076,18 @@ index 000000000000..5c8304e05abd + journal_seq_pin(j, seq)->devs = w->devs_written; + + if (!err) { -+ j->seq_ondisk = seq; -+ + if (!JSET_NO_FLUSH(w->data)) { + j->flushed_seq_ondisk = seq; + j->last_seq_ondisk = w->last_seq; ++ ++ bch2_do_discards(c); ++ closure_wake_up(&c->freelist_wait); + } + } else if (!j->err_seq || seq < j->err_seq) + j->err_seq = seq; + ++ j->seq_ondisk = seq; ++ + /* + * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard + * more buckets: @@ -54812,7 +57103,7 @@ index 000000000000..5c8304e05abd + v = atomic64_read(&j->reservations.counter); + do { + old.v = new.v = v; -+ BUG_ON(new.idx == new.unwritten_idx); ++ BUG_ON(journal_state_count(new, new.unwritten_idx)); + + new.unwritten_idx++; + } while ((v = atomic64_cmpxchg(&j->reservations.counter, @@ -54823,13 +57114,24 @@ index 000000000000..5c8304e05abd + closure_wake_up(&w->wait); + journal_wake(j); + -+ if (test_bit(JOURNAL_NEED_WRITE, &j->flags)) -+ mod_delayed_work(c->io_complete_wq, &j->write_work, 0); -+ spin_unlock(&j->lock); -+ -+ if (new.unwritten_idx != new.idx && -+ !journal_state_count(new, new.unwritten_idx)) ++ if (!journal_state_count(new, new.unwritten_idx) && ++ journal_last_unwritten_seq(j) <= journal_cur_seq(j)) { + closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); ++ } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && ++ new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { ++ struct journal_buf *buf = journal_cur_buf(j); ++ long delta = buf->expires - jiffies; ++ ++ /* ++ * We don't close a journal entry to write it while there's ++ * previous entries still in flight - the current journal entry ++ * might want to be written now: ++ */ ++ ++ mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta)); ++ } ++ ++ 
spin_unlock(&j->lock); +} + +static void journal_write_endio(struct bio *bio) @@ -54911,7 +57213,7 @@ index 000000000000..5c8304e05abd + struct jset_entry *start, *end; + struct jset *jset; + struct bio *bio; -+ char *journal_debug_buf = NULL; ++ struct printbuf journal_debug_buf = PRINTBUF; + bool validate_before_checksum = false; + unsigned i, sectors, bytes, u64s, nr_rw_members = 0; + int ret; @@ -54924,10 +57226,11 @@ index 000000000000..5c8304e05abd + j->write_start_time = local_clock(); + + spin_lock(&j->lock); -+ if (c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush) && -+ !w->must_flush && -+ (jiffies - j->last_flush_write) < msecs_to_jiffies(j->write_delay_ms) && -+ test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)) { ++ if (bch2_journal_error(j) || ++ w->noflush || ++ (!w->must_flush && ++ (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && ++ test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) { + w->noflush = true; + SET_JSET_NO_FLUSH(jset, true); + jset->last_seq = 0; @@ -54964,17 +57267,15 @@ index 000000000000..5c8304e05abd + le32_add_cpu(&jset->u64s, u64s); + BUG_ON(vstruct_sectors(jset, c->block_bits) > w->sectors); + -+ journal_write_compact(jset); -+ + jset->magic = cpu_to_le64(jset_magic(c)); -+ jset->version = c->sb.version < bcachefs_metadata_version_new_versioning ++ jset->version = c->sb.version < bcachefs_metadata_version_bkey_renumber + ? 
cpu_to_le32(BCH_JSET_VERSION_OLD) + : cpu_to_le32(c->sb.version); + + SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); + SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); + -+ if (journal_entry_empty(jset)) ++ if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset)) + j->last_empty_seq = le64_to_cpu(jset->seq); + + if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) @@ -54987,9 +57288,12 @@ index 000000000000..5c8304e05abd + jset_validate_for_write(c, jset)) + goto err; + -+ bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), ++ ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), + jset->encrypted_start, + vstruct_end(jset) - (void *) jset->encrypted_start); ++ if (bch2_fs_fatal_err_on(ret, c, ++ "error decrypting journal entry: %i", ret)) ++ goto err; + + jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), + journal_nonce(jset), jset); @@ -55014,11 +57318,8 @@ index 000000000000..5c8304e05abd + goto retry_alloc; + } + -+ if (ret) { -+ journal_debug_buf = kmalloc(4096, GFP_ATOMIC); -+ if (journal_debug_buf) -+ __bch2_journal_debug_to_text(&_PBUF(journal_debug_buf, 4096), j); -+ } ++ if (ret) ++ __bch2_journal_debug_to_text(&journal_debug_buf, j); + + /* + * write is allocated, no longer need to account for it in @@ -55035,8 +57336,8 @@ index 000000000000..5c8304e05abd + + if (ret) { + bch_err(c, "Unable to allocate journal write:\n%s", -+ journal_debug_buf); -+ kfree(journal_debug_buf); ++ journal_debug_buf.buf); ++ printbuf_exit(&journal_debug_buf); + bch2_fatal_error(c); + continue_at(cl, journal_write_done, c->io_complete_wq); + return; @@ -55044,7 +57345,7 @@ index 000000000000..5c8304e05abd + + w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); + -+ if (test_bit(JOURNAL_NOCHANGES, &j->flags)) ++ if (c->opts.nochanges) + goto no_io; + + for_each_rw_member(ca, c, i) @@ -55067,25 +57368,21 @@ index 000000000000..5c8304e05abd + } + } + -+ bch2_bucket_seq_cleanup(c); -+ + continue_at(cl, do_journal_write, c->io_complete_wq); + return; 
+no_io: -+ bch2_bucket_seq_cleanup(c); -+ + continue_at(cl, journal_write_done, c->io_complete_wq); + return; +err: -+ bch2_inconsistent_error(c); ++ bch2_fatal_error(c); + continue_at(cl, journal_write_done, c->io_complete_wq); +} diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h new file mode 100644 -index 000000000000..f34281a28f12 +index 000000000000..f2001835e43e --- /dev/null +++ b/fs/bcachefs/journal_io.h -@@ -0,0 +1,50 @@ +@@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_JOURNAL_IO_H +#define _BCACHEFS_JOURNAL_IO_H @@ -55096,7 +57393,12 @@ index 000000000000..f34281a28f12 + */ +struct journal_replay { + struct list_head list; -+ struct bch_extent_ptr ptrs[BCH_REPLICAS_MAX]; ++ struct journal_ptr { ++ u8 dev; ++ u32 bucket; ++ u32 bucket_offset; ++ u64 sector; ++ } ptrs[BCH_REPLICAS_MAX]; + unsigned nr_ptrs; + + /* checksum error, but we may want to try using it anyways: */ @@ -55128,8 +57430,13 @@ index 000000000000..f34281a28f12 + for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \ + vstruct_for_each_safe(entry, k, _n) + -+int bch2_journal_entry_validate(struct bch_fs *, const char *, struct jset_entry *, -+ unsigned, int, int); ++int bch2_journal_entry_validate(struct bch_fs *, const char *, ++ struct jset_entry *, unsigned, int, int); ++void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *, ++ struct jset_entry *); ++ ++void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *, ++ struct journal_replay *); + +int bch2_journal_read(struct bch_fs *, struct list_head *, u64 *, u64 *); + @@ -55138,10 +57445,10 @@ index 000000000000..f34281a28f12 +#endif /* _BCACHEFS_JOURNAL_IO_H */ diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c new file mode 100644 -index 000000000000..ca482c6743c3 +index 000000000000..a9f7d5a7feb2 --- /dev/null +++ b/fs/bcachefs/journal_reclaim.c -@@ -0,0 +1,849 @@ +@@ -0,0 +1,847 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include 
"bcachefs.h" @@ -55178,10 +57485,8 @@ index 000000000000..ca482c6743c3 + struct journal_device *ja, + enum journal_space_from from) +{ -+ unsigned available = !test_bit(JOURNAL_NOCHANGES, &j->flags) -+ ? ((journal_space_from(ja, from) - -+ ja->cur_idx - 1 + ja->nr) % ja->nr) -+ : ja->nr; ++ unsigned available = (journal_space_from(ja, from) - ++ ja->cur_idx - 1 + ja->nr) % ja->nr; + + /* + * Don't use the last bucket unless writing the new last_seq @@ -55205,25 +57510,13 @@ index 000000000000..ca482c6743c3 + old.v, new.v)) != old.v); +} + -+static inline unsigned get_unwritten_sectors(struct journal *j, unsigned *idx) -+{ -+ unsigned sectors = 0; -+ -+ while (!sectors && *idx != j->reservations.idx) { -+ sectors = j->buf[*idx].sectors; -+ -+ *idx = (*idx + 1) & JOURNAL_BUF_MASK; -+ } -+ -+ return sectors; -+} -+ +static struct journal_space +journal_dev_space_available(struct journal *j, struct bch_dev *ca, + enum journal_space_from from) +{ + struct journal_device *ja = &ca->journal; -+ unsigned sectors, buckets, unwritten, idx = j->reservations.unwritten_idx; ++ unsigned sectors, buckets, unwritten; ++ u64 seq; + + if (from == journal_space_total) + return (struct journal_space) { @@ -55238,7 +57531,14 @@ index 000000000000..ca482c6743c3 + * We that we don't allocate the space for a journal entry + * until we write it out - thus, account for it here: + */ -+ while ((unwritten = get_unwritten_sectors(j, &idx))) { ++ for (seq = journal_last_unwritten_seq(j); ++ seq <= journal_cur_seq(j); ++ seq++) { ++ unwritten = j->buf[seq & JOURNAL_BUF_MASK].sectors; ++ ++ if (!unwritten) ++ continue; ++ + /* entry won't fit on this device, skip: */ + if (unwritten > ca->mi.bucket_size) + continue; @@ -55346,7 +57646,7 @@ index 000000000000..ca482c6743c3 + j->can_discard = can_discard; + + if (nr_online < c->opts.metadata_replicas_required) { -+ ret = cur_entry_insufficient_devices; ++ ret = JOURNAL_ERR_insufficient_devices; + goto out; + } + @@ -55360,23 +57660,24 @@ index 
000000000000..ca482c6743c3 + total = j->space[journal_space_total].total; + + if (!clean_ondisk && -+ j->reservations.idx == -+ j->reservations.unwritten_idx) { -+ char *buf = kmalloc(4096, GFP_ATOMIC); ++ journal_cur_seq(j) == j->seq_ondisk) { ++ struct printbuf buf = PRINTBUF; + -+ bch_err(c, "journal stuck"); -+ if (buf) { -+ __bch2_journal_debug_to_text(&_PBUF(buf, 4096), j); -+ pr_err("\n%s", buf); -+ kfree(buf); -+ } ++ __bch2_journal_debug_to_text(&buf, j); ++ bch_err(c, "journal stuck\n%s", buf.buf); ++ printbuf_exit(&buf); + ++ /* ++ * Hack: bch2_fatal_error() calls bch2_journal_halt() which ++ * takes journal lock: ++ */ ++ spin_unlock(&j->lock); + bch2_fatal_error(c); -+ ret = cur_entry_journal_stuck; ++ spin_lock(&j->lock); ++ ++ ret = JOURNAL_ERR_journal_stuck; + } else if (!j->space[journal_space_discarded].next_entry) -+ ret = cur_entry_journal_full; -+ else if (!fifo_free(&j->pin)) -+ ret = cur_entry_journal_pin_full; ++ ret = JOURNAL_ERR_journal_full; + + if ((j->space[journal_space_clean_ondisk].next_entry < + j->space[journal_space_clean_ondisk].total) && @@ -55395,7 +57696,7 @@ index 000000000000..ca482c6743c3 + j->cur_entry_sectors = !ret ? 
j->space[journal_space_discarded].next_entry : 0; + j->cur_entry_error = ret; + journal_set_remaining(j, u64s_remaining); -+ journal_check_may_get_unreserved(j); ++ journal_set_watermark(j); + + if (!ret) + journal_wake(j); @@ -55430,7 +57731,8 @@ index 000000000000..ca482c6743c3 + struct journal_device *ja = &ca->journal; + + while (should_discard_bucket(j, ja)) { -+ if (ca->mi.discard && ++ if (!c->opts.nochanges && ++ ca->mi.discard && + blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) + blkdev_issue_discard(ca->disk_sb.bdev, + bucket_to_sector(ca, @@ -55517,9 +57819,6 @@ index 000000000000..ca482c6743c3 + if (atomic_dec_and_test(&pin_list->count) && + pin_list == &fifo_peek_front(&j->pin)) + bch2_journal_reclaim_fast(j); -+ else if (fifo_used(&j->pin) == 1 && -+ atomic_read(&pin_list->count) == 1) -+ journal_wake(j); +} + +void bch2_journal_pin_drop(struct journal *j, @@ -55633,9 +57932,6 @@ index 000000000000..ca482c6743c3 + u64 seq; + int err; + -+ if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags)) -+ return 0; -+ + lockdep_assert_held(&j->reclaim_lock); + + while (1) { @@ -55781,7 +58077,7 @@ index 000000000000..ca482c6743c3 + * make sure to flush at least one journal pin: + */ + if (time_after(jiffies, j->last_flushed + -+ msecs_to_jiffies(j->reclaim_delay_ms))) ++ msecs_to_jiffies(c->opts.journal_reclaim_delay))) + min_nr = 1; + + if (j->prereserved.reserved * 4 > j->prereserved.remaining) @@ -55815,7 +58111,7 @@ index 000000000000..ca482c6743c3 + + if (nr_flushed) + wake_up(&j->reclaim_wait); -+ } while ((min_nr || min_key_cache) && !direct); ++ } while ((min_nr || min_key_cache) && nr_flushed && !direct); + + memalloc_noreclaim_restore(flags); + @@ -55830,13 +58126,13 @@ index 000000000000..ca482c6743c3 +static int bch2_journal_reclaim_thread(void *arg) +{ + struct journal *j = arg; ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); + unsigned long delay, now; ++ bool journal_empty; + int ret = 0; + + set_freezable(); + -+ 
kthread_wait_freezable(test_bit(JOURNAL_RECLAIM_STARTED, &j->flags)); -+ + j->last_flushed = jiffies; + + while (!ret && !kthread_should_stop()) { @@ -55847,7 +58143,7 @@ index 000000000000..ca482c6743c3 + mutex_unlock(&j->reclaim_lock); + + now = jiffies; -+ delay = msecs_to_jiffies(j->reclaim_delay_ms); ++ delay = msecs_to_jiffies(c->opts.journal_reclaim_delay); + j->next_reclaim = j->last_flushed + delay; + + if (!time_in_range(j->next_reclaim, now, now + delay)) @@ -55859,10 +58155,17 @@ index 000000000000..ca482c6743c3 + break; + if (j->reclaim_kicked) + break; -+ if (time_after_eq(jiffies, j->next_reclaim)) -+ break; -+ freezable_schedule_timeout(j->next_reclaim - jiffies); + ++ spin_lock(&j->lock); ++ journal_empty = fifo_empty(&j->pin); ++ spin_unlock(&j->lock); ++ ++ if (journal_empty) ++ freezable_schedule(); ++ else if (time_after(j->next_reclaim, jiffies)) ++ freezable_schedule_timeout(j->next_reclaim - jiffies); ++ else ++ break; + } + __set_current_state(TASK_RUNNING); + } @@ -55914,7 +58217,8 @@ index 000000000000..ca482c6743c3 + + mutex_lock(&j->reclaim_lock); + -+ *did_work = journal_flush_pins(j, seq_to_flush, 0, 0) != 0; ++ if (journal_flush_pins(j, seq_to_flush, 0, 0)) ++ *did_work = true; + + spin_lock(&j->lock); + /* @@ -55923,8 +58227,7 @@ index 000000000000..ca482c6743c3 + */ + ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) || + journal_last_seq(j) > seq_to_flush || -+ (fifo_used(&j->pin) == 1 && -+ atomic_read(&fifo_peek_front(&j->pin).count) == 1); ++ !fifo_used(&j->pin); + + spin_unlock(&j->lock); + mutex_unlock(&j->reclaim_lock); @@ -55972,10 +58275,12 @@ index 000000000000..ca482c6743c3 + seq = 0; + + spin_lock(&j->lock); -+ while (!ret && seq < j->pin.back) { ++ while (!ret) { + struct bch_replicas_padded replicas; + + seq = max(seq, journal_last_seq(j)); ++ if (seq >= j->pin.back) ++ break; + bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, + journal_seq_pin(j, seq)->devs); + seq++; @@ -56083,12 +58388,270 @@ index 
000000000000..0fd1af120db5 +int bch2_journal_flush_device_pins(struct journal *, int); + +#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */ +diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c +new file mode 100644 +index 000000000000..8efe7b7e3dcb +--- /dev/null ++++ b/fs/bcachefs/journal_sb.c +@@ -0,0 +1,222 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "journal_sb.h" ++ ++#include ++ ++/* BCH_SB_FIELD_journal: */ ++ ++static int u64_cmp(const void *_l, const void *_r) ++{ ++ const u64 *l = _l; ++ const u64 *r = _r; ++ ++ return cmp_int(*l, *r); ++} ++ ++static int bch2_sb_journal_validate(struct bch_sb *sb, ++ struct bch_sb_field *f, ++ struct printbuf *err) ++{ ++ struct bch_sb_field_journal *journal = field_to_type(f, journal); ++ struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; ++ int ret = -EINVAL; ++ unsigned nr; ++ unsigned i; ++ u64 *b; ++ ++ nr = bch2_nr_journal_buckets(journal); ++ if (!nr) ++ return 0; ++ ++ b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL); ++ if (!b) ++ return -ENOMEM; ++ ++ for (i = 0; i < nr; i++) ++ b[i] = le64_to_cpu(journal->buckets[i]); ++ ++ sort(b, nr, sizeof(u64), u64_cmp, NULL); ++ ++ if (!b[0]) { ++ pr_buf(err, "journal bucket at sector 0"); ++ goto err; ++ } ++ ++ if (b[0] < le16_to_cpu(m->first_bucket)) { ++ pr_buf(err, "journal bucket %llu before first bucket %u", ++ b[0], le16_to_cpu(m->first_bucket)); ++ goto err; ++ } ++ ++ if (b[nr - 1] >= le64_to_cpu(m->nbuckets)) { ++ pr_buf(err, "journal bucket %llu past end of device (nbuckets %llu)", ++ b[nr - 1], le64_to_cpu(m->nbuckets)); ++ goto err; ++ } ++ ++ for (i = 0; i + 1 < nr; i++) ++ if (b[i] == b[i + 1]) { ++ pr_buf(err, "duplicate journal buckets %llu", b[i]); ++ goto err; ++ } ++ ++ ret = 0; ++err: ++ kfree(b); ++ return ret; ++} ++ ++static void bch2_sb_journal_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_journal *journal = field_to_type(f, journal); 
++ unsigned i, nr = bch2_nr_journal_buckets(journal); ++ ++ pr_buf(out, "Buckets: "); ++ for (i = 0; i < nr; i++) ++ pr_buf(out, " %llu", le64_to_cpu(journal->buckets[i])); ++ pr_newline(out); ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_journal = { ++ .validate = bch2_sb_journal_validate, ++ .to_text = bch2_sb_journal_to_text, ++}; ++ ++struct u64_range { ++ u64 start; ++ u64 end; ++}; ++ ++static int u64_range_cmp(const void *_l, const void *_r) ++{ ++ const struct u64_range *l = _l; ++ const struct u64_range *r = _r; ++ ++ return cmp_int(l->start, r->start); ++} ++ ++static int bch2_sb_journal_v2_validate(struct bch_sb *sb, ++ struct bch_sb_field *f, ++ struct printbuf *err) ++{ ++ struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2); ++ struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; ++ int ret = -EINVAL; ++ unsigned nr; ++ unsigned i; ++ struct u64_range *b; ++ ++ nr = bch2_sb_field_journal_v2_nr_entries(journal); ++ if (!nr) ++ return 0; ++ ++ b = kmalloc_array(sizeof(*b), nr, GFP_KERNEL); ++ if (!b) ++ return -ENOMEM; ++ ++ for (i = 0; i < nr; i++) { ++ b[i].start = le64_to_cpu(journal->d[i].start); ++ b[i].end = b[i].start + le64_to_cpu(journal->d[i].nr); ++ } ++ ++ sort(b, nr, sizeof(*b), u64_range_cmp, NULL); ++ ++ if (!b[0].start) { ++ pr_buf(err, "journal bucket at sector 0"); ++ goto err; ++ } ++ ++ if (b[0].start < le16_to_cpu(m->first_bucket)) { ++ pr_buf(err, "journal bucket %llu before first bucket %u", ++ b[0].start, le16_to_cpu(m->first_bucket)); ++ goto err; ++ } ++ ++ if (b[nr - 1].end > le64_to_cpu(m->nbuckets)) { ++ pr_buf(err, "journal bucket %llu past end of device (nbuckets %llu)", ++ b[nr - 1].end - 1, le64_to_cpu(m->nbuckets)); ++ goto err; ++ } ++ ++ for (i = 0; i + 1 < nr; i++) { ++ if (b[i].end == b[i + 1].start) { ++ pr_buf(err, "contiguous journal buckets ranges %llu-%llu, %llu-%llu", ++ b[i].start, b[i].end, b[i + 1].start, b[i + 1].end); ++ goto err; ++ } ++ ++ if (b[i].end > b[i 
+ 1].start) { ++ pr_buf(err, "duplicate journal buckets in ranges %llu-%llu, %llu-%llu", ++ b[i].start, b[i].end, b[i + 1].start, b[i + 1].end); ++ goto err; ++ } ++ } ++ ++ ret = 0; ++err: ++ kfree(b); ++ return ret; ++} ++ ++static void bch2_sb_journal_v2_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2); ++ unsigned i, nr = bch2_sb_field_journal_v2_nr_entries(journal); ++ ++ pr_buf(out, "Buckets: "); ++ for (i = 0; i < nr; i++) ++ pr_buf(out, " %llu-%llu", ++ le64_to_cpu(journal->d[i].start), ++ le64_to_cpu(journal->d[i].start) + le64_to_cpu(journal->d[i].nr)); ++ pr_newline(out); ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_journal_v2 = { ++ .validate = bch2_sb_journal_v2_validate, ++ .to_text = bch2_sb_journal_v2_to_text, ++}; ++ ++int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct journal_device *ja = &ca->journal; ++ struct bch_sb_field_journal_v2 *j; ++ unsigned i, dst = 0, nr = 1; ++ ++ lockdep_assert_held(&c->sb_lock); ++ ++ if (!ja->nr) { ++ bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal); ++ bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal_v2); ++ return 0; ++ } ++ ++ for (i = 0; i + 1 < ja->nr; i++) ++ if (ja->buckets[i] + 1 != ja->buckets[i + 1]) ++ nr++; ++ ++ j = bch2_sb_resize_journal_v2(&ca->disk_sb, ++ (sizeof(*j) + sizeof(j->d[0]) * nr) / sizeof(u64)); ++ if (!j) ++ return -ENOSPC; ++ ++ bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal); ++ ++ j->d[dst].start = le64_to_cpu(ja->buckets[0]); ++ j->d[dst].nr = le64_to_cpu(1); ++ ++ for (i = 1; i < ja->nr; i++) { ++ if (ja->buckets[i] == ja->buckets[i - 1] + 1) { ++ le64_add_cpu(&j->d[dst].nr, 1); ++ } else { ++ dst++; ++ j->d[dst].start = le64_to_cpu(ja->buckets[i]); ++ j->d[dst].nr = le64_to_cpu(1); ++ } ++ } ++ ++ return 0; ++} +diff --git a/fs/bcachefs/journal_sb.h b/fs/bcachefs/journal_sb.h +new file mode 100644 +index 
000000000000..a39192e9f6f4 +--- /dev/null ++++ b/fs/bcachefs/journal_sb.h +@@ -0,0 +1,24 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++ ++#include "super-io.h" ++#include "vstructs.h" ++ ++static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j) ++{ ++ return j ++ ? (__le64 *) vstruct_end(&j->field) - j->buckets ++ : 0; ++} ++ ++static inline unsigned bch2_sb_field_journal_v2_nr_entries(struct bch_sb_field_journal_v2 *j) ++{ ++ if (!j) ++ return 0; ++ ++ return (struct bch_sb_field_journal_v2_entry *) vstruct_end(&j->field) - &j->d[0]; ++} ++ ++extern const struct bch_sb_field_ops bch_sb_field_ops_journal; ++extern const struct bch_sb_field_ops bch_sb_field_ops_journal_v2; ++ ++int bch2_journal_buckets_to_sb(struct bch_fs *, struct bch_dev *); diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c new file mode 100644 -index 000000000000..79bc0e49389b +index 000000000000..3140c8731431 --- /dev/null +++ b/fs/bcachefs/journal_seq_blacklist.c -@@ -0,0 +1,315 @@ +@@ -0,0 +1,322 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -56157,6 +58720,12 @@ index 000000000000..79bc0e49389b + return bl; +} + ++static bool bl_entry_contig_or_overlaps(struct journal_seq_blacklist_entry *e, ++ u64 start, u64 end) ++{ ++ return !(end < le64_to_cpu(e->start) || le64_to_cpu(e->end) < start); ++} ++ +int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) +{ + struct bch_sb_field_journal_seq_blacklist *bl; @@ -56167,28 +58736,21 @@ index 000000000000..79bc0e49389b + bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); + nr = blacklist_nr_entries(bl); + -+ if (bl) { -+ for (i = 0; i < nr; i++) { -+ struct journal_seq_blacklist_entry *e = -+ bl->start + i; ++ for (i = 0; i < nr; i++) { ++ struct journal_seq_blacklist_entry *e = ++ bl->start + i; + -+ if (start == le64_to_cpu(e->start) && -+ end == le64_to_cpu(e->end)) -+ goto out; ++ if (bl_entry_contig_or_overlaps(e, start, end)) { ++ e->start = 
cpu_to_le64(min(start, le64_to_cpu(e->start))); ++ e->end = cpu_to_le64(max(end, le64_to_cpu(e->end))); + -+ if (start <= le64_to_cpu(e->start) && -+ end >= le64_to_cpu(e->end)) { -+ e->start = cpu_to_le64(start); -+ e->end = cpu_to_le64(end); -+ -+ if (i + 1 < nr) -+ bl = blacklist_entry_try_merge(c, -+ bl, i); -+ if (i) -+ bl = blacklist_entry_try_merge(c, -+ bl, i - 1); -+ goto out_write_sb; -+ } ++ if (i + 1 < nr) ++ bl = blacklist_entry_try_merge(c, ++ bl, i); ++ if (i) ++ bl = blacklist_entry_try_merge(c, ++ bl, i - 1); ++ goto out_write_sb; + } + } + @@ -56280,27 +58842,34 @@ index 000000000000..79bc0e49389b + return 0; +} + -+static const char * -+bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, -+ struct bch_sb_field *f) ++static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, ++ struct bch_sb_field *f, ++ struct printbuf *err) +{ + struct bch_sb_field_journal_seq_blacklist *bl = + field_to_type(f, journal_seq_blacklist); -+ struct journal_seq_blacklist_entry *i; -+ unsigned nr = blacklist_nr_entries(bl); ++ unsigned i, nr = blacklist_nr_entries(bl); + -+ for (i = bl->start; i < bl->start + nr; i++) { -+ if (le64_to_cpu(i->start) >= -+ le64_to_cpu(i->end)) -+ return "entry start >= end"; ++ for (i = 0; i < nr; i++) { ++ struct journal_seq_blacklist_entry *e = bl->start + i; + -+ if (i + 1 < bl->start + nr && -+ le64_to_cpu(i[0].end) > -+ le64_to_cpu(i[1].start)) -+ return "entries out of order"; ++ if (le64_to_cpu(e->start) >= ++ le64_to_cpu(e->end)) { ++ pr_buf(err, "entry %u start >= end (%llu >= %llu)", ++ i, le64_to_cpu(e->start), le64_to_cpu(e->end)); ++ return -EINVAL; ++ } ++ ++ if (i + 1 < nr && ++ le64_to_cpu(e[0].end) > ++ le64_to_cpu(e[1].start)) { ++ pr_buf(err, "entry %u out of order with next entry (%llu > %llu)", ++ i + 1, le64_to_cpu(e[0].end), le64_to_cpu(e[1].start)); ++ return -EINVAL; ++ } + } + -+ return NULL; ++ return 0; +} + +static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out, @@ -56320,6 
+58889,7 @@ index 000000000000..79bc0e49389b + le64_to_cpu(i->start), + le64_to_cpu(i->end)); + } ++ pr_newline(out); +} + +const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = { @@ -56434,10 +59004,10 @@ index 000000000000..afb886ec8e25 +#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */ diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h new file mode 100644 -index 000000000000..d484513289aa +index 000000000000..a6cdb885ad41 --- /dev/null +++ b/fs/bcachefs/journal_types.h -@@ -0,0 +1,324 @@ +@@ -0,0 +1,340 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_JOURNAL_TYPES_H +#define _BCACHEFS_JOURNAL_TYPES_H @@ -56465,6 +59035,8 @@ index 000000000000..d484513289aa + + struct closure_waitlist wait; + u64 last_seq; /* copy of data->last_seq */ ++ long expires; ++ u64 flush_time; + + unsigned buf_size; /* size in bytes of @data */ + unsigned sectors; /* maximum size for current entry */ @@ -56579,20 +59151,39 @@ index 000000000000..d484513289aa + journal_space_nr, +}; + -+/* -+ * JOURNAL_NEED_WRITE - current (pending) journal entry should be written ASAP, -+ * either because something's waiting on the write to complete or because it's -+ * been dirty too long and the timer's expired. 
-+ */ -+ +enum { + JOURNAL_REPLAY_DONE, + JOURNAL_STARTED, -+ JOURNAL_RECLAIM_STARTED, -+ JOURNAL_NEED_WRITE, -+ JOURNAL_MAY_GET_UNRESERVED, + JOURNAL_MAY_SKIP_FLUSH, -+ JOURNAL_NOCHANGES, ++}; ++ ++#define JOURNAL_WATERMARKS() \ ++ x(any) \ ++ x(copygc) \ ++ x(reserved) ++ ++enum journal_watermark { ++#define x(n) JOURNAL_WATERMARK_##n, ++ JOURNAL_WATERMARKS() ++#undef x ++}; ++ ++#define JOURNAL_WATERMARK_MASK 3 ++ ++/* Reasons we may fail to get a journal reservation: */ ++#define JOURNAL_ERRORS() \ ++ x(ok) \ ++ x(blocked) \ ++ x(max_in_flight) \ ++ x(journal_full) \ ++ x(journal_pin_full) \ ++ x(journal_stuck) \ ++ x(insufficient_devices) ++ ++enum journal_errors { ++#define x(n) JOURNAL_ERR_##n, ++ JOURNAL_ERRORS() ++#undef x +}; + +/* Embedded in struct bch_fs */ @@ -56602,6 +59193,7 @@ index 000000000000..d484513289aa + unsigned long flags; + + union journal_res_state reservations; ++ enum journal_watermark watermark; + + /* Max size of current journal entry */ + unsigned cur_entry_u64s; @@ -56611,14 +59203,7 @@ index 000000000000..d484513289aa + * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if + * insufficient devices: + */ -+ enum { -+ cur_entry_ok, -+ cur_entry_blocked, -+ cur_entry_journal_full, -+ cur_entry_journal_pin_full, -+ cur_entry_journal_stuck, -+ cur_entry_insufficient_devices, -+ } cur_entry_error; ++ enum journal_errors cur_entry_error; + + union journal_preres_state prereserved; + @@ -56686,6 +59271,10 @@ index 000000000000..d484513289aa + spinlock_t err_lock; + + struct mutex reclaim_lock; ++ /* ++ * Used for waiting until journal reclaim has freed up space in the ++ * journal: ++ */ + wait_queue_head_t reclaim_wait; + struct task_struct *reclaim_thread; + bool reclaim_kicked; @@ -56702,19 +59291,16 @@ index 000000000000..d484513289aa + struct mutex discard_lock; + bool can_discard; + -+ unsigned write_delay_ms; -+ unsigned reclaim_delay_ms; + unsigned long last_flush_write; + + u64 res_get_blocked_start; -+ u64 need_write_time; 
+ u64 write_start_time; + + u64 nr_flush_writes; + u64 nr_noflush_writes; + -+ struct time_stats *write_time; -+ struct time_stats *delay_time; ++ struct time_stats *flush_write_time; ++ struct time_stats *noflush_write_time; + struct time_stats *blocked_time; + struct time_stats *flush_seq_time; + @@ -56939,6 +59525,238 @@ index 000000000000..4b3ff7d8a875 +}; + +#endif /* _BCACHEFS_KEYLIST_TYPES_H */ +diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c +new file mode 100644 +index 000000000000..4f0e6960e597 +--- /dev/null ++++ b/fs/bcachefs/lru.c +@@ -0,0 +1,203 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "btree_iter.h" ++#include "btree_update.h" ++#include "error.h" ++#include "lru.h" ++#include "recovery.h" ++ ++const char *bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ const struct bch_lru *lru = bkey_s_c_to_lru(k).v; ++ ++ if (bkey_val_bytes(k.k) < sizeof(*lru)) ++ return "incorrect value size"; ++ ++ return NULL; ++} ++ ++void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ const struct bch_lru *lru = bkey_s_c_to_lru(k).v; ++ ++ pr_buf(out, "idx %llu", le64_to_cpu(lru->idx)); ++} ++ ++static int lru_delete(struct btree_trans *trans, u64 id, u64 idx, u64 time) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ u64 existing_idx; ++ int ret = 0; ++ ++ if (!time) ++ return 0; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_lru, ++ POS(id, time), ++ BTREE_ITER_INTENT| ++ BTREE_ITER_WITH_UPDATES); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_lru) { ++ bch2_fs_inconsistent(c, ++ "pointer to nonexistent lru %llu:%llu", ++ id, time); ++ ret = -EIO; ++ goto err; ++ } ++ ++ existing_idx = le64_to_cpu(bkey_s_c_to_lru(k).v->idx); ++ if (existing_idx != idx) { ++ bch2_fs_inconsistent(c, ++ "lru %llu:%llu with wrong backpointer: got %llu, 
should be %llu", ++ id, time, existing_idx, idx); ++ ret = -EIO; ++ goto err; ++ } ++ ++ ret = bch2_btree_delete_at(trans, &iter, 0); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static int lru_set(struct btree_trans *trans, u64 lru_id, u64 idx, u64 *time) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_i_lru *lru; ++ int ret = 0; ++ ++ if (!*time) ++ return 0; ++ ++ for_each_btree_key_norestart(trans, iter, BTREE_ID_lru, ++ POS(lru_id, *time), ++ BTREE_ITER_SLOTS| ++ BTREE_ITER_INTENT| ++ BTREE_ITER_WITH_UPDATES, k, ret) ++ if (bkey_deleted(k.k)) ++ break; ++ ++ if (ret) ++ goto err; ++ ++ BUG_ON(iter.pos.inode != lru_id); ++ *time = iter.pos.offset; ++ ++ lru = bch2_trans_kmalloc(trans, sizeof(*lru)); ++ ret = PTR_ERR_OR_ZERO(lru); ++ if (ret) ++ goto err; ++ ++ bkey_lru_init(&lru->k_i); ++ lru->k.p = iter.pos; ++ lru->v.idx = cpu_to_le64(idx); ++ ++ ret = bch2_trans_update(trans, &iter, &lru->k_i, 0); ++ if (ret) ++ goto err; ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++int bch2_lru_change(struct btree_trans *trans, u64 id, u64 idx, ++ u64 old_time, u64 *new_time) ++{ ++ if (old_time == *new_time) ++ return 0; ++ ++ return lru_delete(trans, id, idx, old_time) ?: ++ lru_set(trans, id, idx, new_time); ++} ++ ++static int bch2_check_lru_key(struct btree_trans *trans, ++ struct btree_iter *lru_iter, bool initial) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_s_c lru_k, k; ++ struct bch_alloc_v4 a; ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; ++ u64 idx; ++ int ret; ++ ++ lru_k = bch2_btree_iter_peek(lru_iter); ++ if (!lru_k.k) ++ return 0; ++ ++ ret = bkey_err(lru_k); ++ if (ret) ++ return ret; ++ ++ idx = le64_to_cpu(bkey_s_c_to_lru(lru_k).v->idx); ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, ++ POS(lru_k.k->p.inode, idx), 0); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ 
bch2_alloc_to_v4(k, &a); ++ ++ if (fsck_err_on(bucket_state(a) != BUCKET_cached || ++ a.io_time[READ] != lru_k.k->p.offset, c, ++ "incorrect lru entry %s\n" ++ " for %s", ++ (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf), ++ (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) { ++ struct bkey_i *update = ++ bch2_trans_kmalloc(trans, sizeof(*update)); ++ ++ ret = PTR_ERR_OR_ZERO(update); ++ if (ret) ++ goto err; ++ ++ bkey_init(&update->k); ++ update->k.p = lru_iter->pos; ++ ++ ret = bch2_trans_update(trans, lru_iter, update, 0); ++ if (ret) ++ goto err; ++ } ++err: ++fsck_err: ++ bch2_trans_iter_exit(trans, &iter); ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf1); ++ return ret; ++} ++ ++int bch2_check_lrus(struct bch_fs *c, bool initial) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_lru, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ bch2_check_lru_key(&trans, &iter, initial)); ++ if (ret) ++ break; ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ bch2_trans_exit(&trans); ++ return ret; ++ ++} +diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h +new file mode 100644 +index 000000000000..4db6a8399332 +--- /dev/null ++++ b/fs/bcachefs/lru.h +@@ -0,0 +1,17 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_LRU_H ++#define _BCACHEFS_LRU_H ++ ++const char *bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++ ++#define bch2_bkey_ops_lru (struct bkey_ops) { \ ++ .key_invalid = bch2_lru_invalid, \ ++ .val_to_text = bch2_lru_to_text, \ ++} ++ ++int bch2_lru_change(struct btree_trans *, u64, u64, u64, u64 *); ++ ++int bch2_check_lrus(struct bch_fs *, bool); ++ ++#endif /* _BCACHEFS_LRU_H */ diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c new file mode 100644 index 
000000000000..6defc33322b3 @@ -57156,10 +59974,10 @@ index 000000000000..027efaa0d575 +#endif /* _BCACHEFS_MIGRATE_H */ diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c new file mode 100644 -index 000000000000..64e39c10e34b +index 000000000000..1de213506adf --- /dev/null +++ b/fs/bcachefs/move.c -@@ -0,0 +1,1124 @@ +@@ -0,0 +1,1130 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -57254,10 +60072,10 @@ index 000000000000..64e39c10e34b + + if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, old_pos.snapshot)) { + struct bkey_i *update; -+ size_t i; ++ u32 *i; + -+ for (i = 0; i < s.nr; i++) -+ if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, s.d[i])) ++ darray_for_each(s.ids, i) ++ if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, *i)) + goto next; + + update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); @@ -57287,7 +60105,7 @@ index 000000000000..64e39c10e34b + } + } + bch2_trans_iter_exit(trans, &iter); -+ kfree(s.d); ++ darray_exit(s.ids); + + return ret; +} @@ -57513,8 +60331,7 @@ index 000000000000..64e39c10e34b + } + + if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) { -+ m->op.alloc_reserve = RESERVE_MOVINGGC; -+ m->op.flags |= BCH_WRITE_ALLOC_NOWAIT; ++ m->op.alloc_reserve = RESERVE_movinggc; + } else { + /* XXX: this should probably be passed in */ + m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; @@ -57556,10 +60373,14 @@ index 000000000000..64e39c10e34b + unsigned compressed_sectors = 0; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) -+ if (p.ptr.dev == data_opts.rewrite_dev && -+ !p.ptr.cached && -+ crc_is_compressed(p.crc)) -+ compressed_sectors += p.crc.compressed_size; ++ if (p.ptr.dev == data_opts.rewrite_dev) { ++ if (p.ptr.cached) ++ m->op.flags |= BCH_WRITE_CACHED; ++ ++ if (!p.ptr.cached && ++ crc_is_compressed(p.crc)) ++ compressed_sectors += p.crc.compressed_size; ++ } + + if (compressed_sectors) { + ret = bch2_disk_reservation_add(c, &m->op.res, @@ -57639,25 +60460,26 @@ index 
000000000000..64e39c10e34b + atomic_sub(io->read_sectors, &ctxt->read_sectors); + io->read_completed = true; + -+ if (next_pending_write(ctxt)) -+ wake_up(&ctxt->wait); -+ ++ wake_up(&ctxt->wait); + closure_put(&ctxt->cl); +} + -+static void do_pending_writes(struct moving_context *ctxt) ++static void do_pending_writes(struct moving_context *ctxt, struct btree_trans *trans) +{ + struct moving_io *io; + ++ if (trans) ++ bch2_trans_unlock(trans); ++ + while ((io = next_pending_write(ctxt))) { + list_del(&io->list); + closure_call(&io->cl, move_write, NULL, &ctxt->cl); + } +} + -+#define move_ctxt_wait_event(_ctxt, _cond) \ ++#define move_ctxt_wait_event(_ctxt, _trans, _cond) \ +do { \ -+ do_pending_writes(_ctxt); \ ++ do_pending_writes(_ctxt, _trans); \ + \ + if (_cond) \ + break; \ @@ -57665,11 +60487,12 @@ index 000000000000..64e39c10e34b + next_pending_write(_ctxt) || (_cond)); \ +} while (1) + -+static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt) ++static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt, ++ struct btree_trans *trans) +{ + unsigned sectors_pending = atomic_read(&ctxt->write_sectors); + -+ move_ctxt_wait_event(ctxt, ++ move_ctxt_wait_event(ctxt, trans, + !atomic_read(&ctxt->write_sectors) || + atomic_read(&ctxt->write_sectors) != sectors_pending); +} @@ -57691,14 +60514,6 @@ index 000000000000..64e39c10e34b + unsigned sectors = k.k->size, pages; + int ret = -ENOMEM; + -+ move_ctxt_wait_event(ctxt, -+ atomic_read(&ctxt->write_sectors) < -+ SECTORS_IN_FLIGHT_PER_DEVICE); -+ -+ move_ctxt_wait_event(ctxt, -+ atomic_read(&ctxt->read_sectors) < -+ SECTORS_IN_FLIGHT_PER_DEVICE); -+ + /* write path might have to decompress data: */ + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + sectors = max_t(unsigned, sectors, p.crc.uncompressed_size); @@ -57849,26 +60664,36 @@ index 000000000000..64e39c10e34b + schedule_timeout(delay); + + if (unlikely(freezing(current))) { -+ bch2_trans_unlock(&trans); -+ move_ctxt_wait_event(ctxt, 
list_empty(&ctxt->reads)); ++ move_ctxt_wait_event(ctxt, &trans, list_empty(&ctxt->reads)); + try_to_freeze(); + } + } while (delay); + ++ move_ctxt_wait_event(ctxt, &trans, ++ atomic_read(&ctxt->write_sectors) < ++ SECTORS_IN_FLIGHT_PER_DEVICE); ++ ++ move_ctxt_wait_event(ctxt, &trans, ++ atomic_read(&ctxt->read_sectors) < ++ SECTORS_IN_FLIGHT_PER_DEVICE); ++ + bch2_trans_begin(&trans); + + k = bch2_btree_iter_peek(&iter); -+ -+ stats->pos = iter.pos; -+ + if (!k.k) + break; ++ + ret = bkey_err(k); ++ if (ret == -EINTR) ++ continue; + if (ret) + break; ++ + if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) + break; + ++ stats->pos = iter.pos; ++ + if (!bkey_extent_is_direct_data(k.k)) + goto next_nondata; + @@ -57903,22 +60728,22 @@ index 000000000000..64e39c10e34b + BUG(); + } + -+ /* unlock before doing IO: */ ++ /* ++ * The iterator gets unlocked by __bch2_read_extent - need to ++ * save a copy of @k elsewhere: ++ */ + bch2_bkey_buf_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); -+ bch2_trans_unlock(&trans); + + ret2 = bch2_move_extent(&trans, ctxt, wp, io_opts, btree_id, k, + data_cmd, data_opts); + if (ret2) { -+ if (ret2 == -EINTR) { -+ bch2_trans_begin(&trans); ++ if (ret2 == -EINTR) + continue; -+ } + + if (ret2 == -ENOMEM) { + /* memory allocation failure, wait for some IO to finish */ -+ bch2_move_ctxt_wait_for_io(ctxt); ++ bch2_move_ctxt_wait_for_io(ctxt, &trans); + continue; + } + @@ -57929,8 +60754,7 @@ index 000000000000..64e39c10e34b + if (rate) + bch2_ratelimit_increment(rate, k.k->size); +next: -+ atomic64_add(k.k->size * bch2_bkey_nr_ptrs_allocated(k), -+ &stats->sectors_seen); ++ atomic64_add(k.k->size, &stats->sectors_seen); +next_nondata: + bch2_btree_iter_advance(&iter); + } @@ -58004,7 +60828,7 @@ index 000000000000..64e39c10e34b + } + + -+ move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads)); ++ move_ctxt_wait_event(&ctxt, NULL, list_empty(&ctxt.reads)); + closure_sync(&ctxt.cl); + + EBUG_ON(atomic_read(&ctxt.write_sectors)); @@ -58390,10 
+61214,10 @@ index 000000000000..9df6d18137a5 +#endif /* _BCACHEFS_MOVE_TYPES_H */ diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c new file mode 100644 -index 000000000000..5c9eafc026c9 +index 000000000000..cb6b81678ecc --- /dev/null +++ b/fs/bcachefs/movinggc.c -@@ -0,0 +1,385 @@ +@@ -0,0 +1,424 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Moving/copying garbage collector @@ -58402,6 +61226,7 @@ index 000000000000..5c9eafc026c9 + */ + +#include "bcachefs.h" ++#include "alloc_background.h" +#include "alloc_foreground.h" +#include "btree_iter.h" +#include "btree_update.h" @@ -58425,21 +61250,6 @@ index 000000000000..5c9eafc026c9 +#include +#include + -+/* -+ * We can't use the entire copygc reserve in one iteration of copygc: we may -+ * need the buckets we're freeing up to go back into the copygc reserve to make -+ * forward progress, but if the copygc reserve is full they'll be available for -+ * any allocation - and it's possible that in a given iteration, we free up most -+ * of the buckets we're going to free before we allocate most of the buckets -+ * we're going to allocate. 
-+ * -+ * If we only use half of the reserve per iteration, then in steady state we'll -+ * always have room in the reserve for the buckets we're going to need in the -+ * next iteration: -+ */ -+#define COPYGC_BUCKETS_PER_ITER(ca) \ -+ ((ca)->free[RESERVE_MOVINGGC].size / 2) -+ +static int bucket_offset_cmp(const void *_l, const void *_r, size_t size) +{ + const struct copygc_heap_entry *l = _l; @@ -58465,10 +61275,14 @@ index 000000000000..5c9eafc026c9 + .dev = p.ptr.dev, + .offset = p.ptr.offset, + }; ++ ssize_t i; + -+ ssize_t i = eytzinger0_find_le(h->data, h->used, -+ sizeof(h->data[0]), -+ bucket_offset_cmp, &search); ++ if (p.ptr.cached) ++ continue; ++ ++ i = eytzinger0_find_le(h->data, h->used, ++ sizeof(h->data[0]), ++ bucket_offset_cmp, &search); +#if 0 + /* eytzinger search verify code: */ + ssize_t j = -1, k; @@ -58497,7 +61311,7 @@ index 000000000000..5c9eafc026c9 + data_opts->target = io_opts->background_target; + data_opts->nr_replicas = 1; + data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE| -+ BTREE_INSERT_JOURNAL_RESERVED; ++ JOURNAL_WATERMARK_copygc; + data_opts->rewrite_dev = p.ptr.dev; + + if (p.has_ec) @@ -58510,18 +61324,6 @@ index 000000000000..5c9eafc026c9 + return DATA_SKIP; +} + -+static bool have_copygc_reserve(struct bch_dev *ca) -+{ -+ bool ret; -+ -+ spin_lock(&ca->fs->freelist_lock); -+ ret = fifo_full(&ca->free[RESERVE_MOVINGGC]) || -+ ca->allocator_state != ALLOCATOR_running; -+ spin_unlock(&ca->fs->freelist_lock); -+ -+ return ret; -+} -+ +static inline int fragmentation_cmp(copygc_heap *heap, + struct copygc_heap_entry l, + struct copygc_heap_entry r) @@ -58529,18 +61331,106 @@ index 000000000000..5c9eafc026c9 + return cmp_int(l.fragmentation, r.fragmentation); +} + ++static int walk_buckets_to_copygc(struct bch_fs *c) ++{ ++ copygc_heap *h = &c->copygc_heap; ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bch_alloc_v4 a; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ 
for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, iter.pos.inode); ++ struct copygc_heap_entry e; ++ ++ bch2_alloc_to_v4(k, &a); ++ ++ if (a.data_type != BCH_DATA_user || ++ a.dirty_sectors >= ca->mi.bucket_size || ++ bch2_bucket_is_open(c, iter.pos.inode, iter.pos.offset)) ++ continue; ++ ++ e = (struct copygc_heap_entry) { ++ .dev = iter.pos.inode, ++ .gen = a.gen, ++ .replicas = 1 + a.stripe_redundancy, ++ .fragmentation = (u64) a.dirty_sectors * (1ULL << 31) ++ / ca->mi.bucket_size, ++ .sectors = a.dirty_sectors, ++ .offset = bucket_to_sector(ca, iter.pos.offset), ++ }; ++ heap_add_or_replace(h, e, -fragmentation_cmp, NULL); ++ ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int bucket_inorder_cmp(const void *_l, const void *_r) ++{ ++ const struct copygc_heap_entry *l = _l; ++ const struct copygc_heap_entry *r = _r; ++ ++ return cmp_int(l->dev, r->dev) ?: cmp_int(l->offset, r->offset); ++} ++ ++static int check_copygc_was_done(struct bch_fs *c, ++ u64 *sectors_not_moved, ++ u64 *buckets_not_moved) ++{ ++ copygc_heap *h = &c->copygc_heap; ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bch_alloc_v4 a; ++ struct copygc_heap_entry *i; ++ int ret = 0; ++ ++ sort(h->data, h->used, sizeof(h->data[0]), bucket_inorder_cmp, NULL); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN, 0); ++ ++ for (i = h->data; i < h->data + h->used; i++) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, i->dev); ++ ++ bch2_btree_iter_set_pos(&iter, POS(i->dev, sector_to_bucket(ca, i->offset))); ++ ++ ret = lockrestart_do(&trans, ++ bkey_err(k = bch2_btree_iter_peek_slot(&iter))); ++ if (ret) ++ break; ++ ++ bch2_alloc_to_v4(k, &a); ++ ++ if (a.gen == i->gen && a.dirty_sectors) { ++ *sectors_not_moved += a.dirty_sectors; ++ *buckets_not_moved += 1; ++ } 
++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ +static int bch2_copygc(struct bch_fs *c) +{ + copygc_heap *h = &c->copygc_heap; + struct copygc_heap_entry e, *i; -+ struct bucket_array *buckets; + struct bch_move_stats move_stats; -+ u64 sectors_to_move = 0, sectors_not_moved = 0; ++ u64 sectors_to_move = 0, sectors_to_write = 0, sectors_not_moved = 0; + u64 sectors_reserved = 0; + u64 buckets_to_move, buckets_not_moved = 0; + struct bch_dev *ca; + unsigned dev_idx; -+ size_t b, heap_size = 0; ++ size_t heap_size = 0; + int ret; + + bch_move_stats_init(&move_stats, "copygc"); @@ -58565,64 +61455,49 @@ index 000000000000..5c9eafc026c9 + } + + for_each_rw_member(ca, c, dev_idx) { -+ closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca)); ++ s64 avail = min(dev_buckets_available(ca, RESERVE_movinggc), ++ ca->mi.nbuckets >> 6); + -+ spin_lock(&ca->fs->freelist_lock); -+ sectors_reserved += fifo_used(&ca->free[RESERVE_MOVINGGC]) * ca->mi.bucket_size; -+ spin_unlock(&ca->fs->freelist_lock); -+ -+ down_read(&ca->bucket_lock); -+ buckets = bucket_array(ca); -+ -+ for (b = buckets->first_bucket; b < buckets->nbuckets; b++) { -+ struct bucket *g = buckets->b + b; -+ struct bucket_mark m = READ_ONCE(g->mark); -+ struct copygc_heap_entry e; -+ -+ if (m.owned_by_allocator || -+ m.data_type != BCH_DATA_user || -+ !bucket_sectors_used(m) || -+ bucket_sectors_used(m) >= ca->mi.bucket_size) -+ continue; -+ -+ WARN_ON(m.stripe && !g->stripe_redundancy); -+ -+ e = (struct copygc_heap_entry) { -+ .dev = dev_idx, -+ .gen = m.gen, -+ .replicas = 1 + g->stripe_redundancy, -+ .fragmentation = bucket_sectors_used(m) * (1U << 15) -+ / ca->mi.bucket_size, -+ .sectors = bucket_sectors_used(m), -+ .offset = bucket_to_sector(ca, b), -+ }; -+ heap_add_or_replace(h, e, -fragmentation_cmp, NULL); -+ } -+ up_read(&ca->bucket_lock); ++ sectors_reserved += avail * ca->mi.bucket_size; + } + ++ ret = walk_buckets_to_copygc(c); ++ if (ret) { ++ 
bch2_fs_fatal_error(c, "error walking buckets to copygc!"); ++ return ret; ++ } ++ ++ if (!h->used) { ++ bch_err_ratelimited(c, "copygc requested to run but found no buckets to move!"); ++ return 0; ++ } ++ ++ /* ++ * Our btree node allocations also come out of RESERVE_movingc: ++ */ ++ sectors_reserved = (sectors_reserved * 3) / 4; + if (!sectors_reserved) { + bch2_fs_fatal_error(c, "stuck, ran out of copygc reserve!"); + return -1; + } + -+ /* -+ * Our btree node allocations also come out of RESERVE_MOVINGGC: -+ */ -+ sectors_to_move = (sectors_to_move * 3) / 4; ++ for (i = h->data; i < h->data + h->used; i++) { ++ sectors_to_move += i->sectors; ++ sectors_to_write += i->sectors * i->replicas; ++ } + -+ for (i = h->data; i < h->data + h->used; i++) -+ sectors_to_move += i->sectors * i->replicas; -+ -+ while (sectors_to_move > sectors_reserved) { ++ while (sectors_to_write > sectors_reserved) { + BUG_ON(!heap_pop(h, e, -fragmentation_cmp, NULL)); -+ sectors_to_move -= e.sectors * e.replicas; ++ sectors_to_write -= e.sectors * e.replicas; + } + + buckets_to_move = h->used; + -+ if (!buckets_to_move) ++ if (!buckets_to_move) { ++ bch_err_ratelimited(c, "copygc cannot run - sectors_reserved %llu!", ++ sectors_reserved); + return 0; ++ } + + eytzinger0_sort(h->data, h->used, + sizeof(h->data[0]), @@ -58635,30 +61510,18 @@ index 000000000000..5c9eafc026c9 + writepoint_ptr(&c->copygc_write_point), + copygc_pred, NULL, + &move_stats); -+ -+ for_each_rw_member(ca, c, dev_idx) { -+ down_read(&ca->bucket_lock); -+ buckets = bucket_array(ca); -+ for (i = h->data; i < h->data + h->used; i++) { -+ struct bucket_mark m; -+ size_t b; -+ -+ if (i->dev != dev_idx) -+ continue; -+ -+ b = sector_to_bucket(ca, i->offset); -+ m = READ_ONCE(buckets->b[b].mark); -+ -+ if (i->gen == m.gen && -+ bucket_sectors_used(m)) { -+ sectors_not_moved += bucket_sectors_used(m); -+ buckets_not_moved++; -+ } -+ } -+ up_read(&ca->bucket_lock); ++ if (ret) { ++ bch_err(c, "error %i from 
bch2_move_data() in copygc", ret); ++ return ret; + } + -+ if (sectors_not_moved && !ret) ++ ret = check_copygc_was_done(c, &sectors_not_moved, &buckets_not_moved); ++ if (ret) { ++ bch_err(c, "error %i from check_copygc_was_done()", ret); ++ return ret; ++ } ++ ++ if (sectors_not_moved) + bch_warn_ratelimited(c, + "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved (move stats: moved %llu sectors, raced %llu keys, %llu sectors)", + sectors_not_moved, sectors_to_move, @@ -58696,8 +61559,8 @@ index 000000000000..5c9eafc026c9 + for_each_rw_member(ca, c, dev_idx) { + struct bch_dev_usage usage = bch2_dev_usage_read(ca); + -+ fragmented_allowed = ((__dev_buckets_reclaimable(ca, usage) * -+ ca->mi.bucket_size) >> 1); ++ fragmented_allowed = ((__dev_buckets_available(ca, usage, RESERVE_none) * ++ ca->mi.bucket_size) >> 1); + fragmented = usage.d[BCH_DATA_user].fragmented; + + wait = min(wait, max(0LL, fragmented_allowed - fragmented)); @@ -58796,10 +61659,10 @@ index 000000000000..922738247d03 +#endif /* _BCACHEFS_MOVINGGC_H */ diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c new file mode 100644 -index 000000000000..a955ef2008c9 +index 000000000000..77fbb7d2194e --- /dev/null +++ b/fs/bcachefs/opts.c -@@ -0,0 +1,470 @@ +@@ -0,0 +1,560 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include @@ -58811,7 +61674,12 @@ index 000000000000..a955ef2008c9 +#include "super-io.h" +#include "util.h" + -+#define x(t, n) #t, ++#define x(t, n) [n] = #t, ++ ++const char * const bch2_metadata_versions[] = { ++ BCH_METADATA_VERSIONS() ++ NULL ++}; ++ +const char * const bch2_error_actions[] = { + BCH_ERROR_ACTIONS() @@ -58868,13 +61736,18 @@ index 000000000000..a955ef2008c9 + NULL +}; + -+const char * const bch2_cache_replacement_policies[] = { -+ BCH_CACHE_REPLACEMENT_POLICIES() ++const char * const bch2_member_states[] = { ++ BCH_MEMBER_STATES() + NULL +}; + -+const char * const bch2_member_states[] = { -+ BCH_MEMBER_STATES() ++const char * const bch2_jset_entry_types[] = {
++ BCH_JSET_ENTRY_TYPES() ++ NULL ++}; ++ ++const char * const bch2_fs_usage_types[] = { ++ BCH_FS_USAGE_TYPES() + NULL +}; + @@ -58893,6 +61766,16 @@ index 000000000000..a955ef2008c9 + [DT_SUBVOL] = "subvol", +}; + ++u64 BCH2_NO_SB_OPT(const struct bch_sb *sb) ++{ ++ BUG(); ++} ++ ++void SET_BCH2_NO_SB_OPT(struct bch_sb *sb, u64 v) ++{ ++ BUG(); ++} ++ +void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src) +{ +#define x(_name, ...) \ @@ -58943,41 +61826,27 @@ index 000000000000..a955ef2008c9 + } +} + -+/* -+ * Initial options from superblock - here we don't want any options undefined, -+ * any options the superblock doesn't specify are set to 0: -+ */ -+struct bch_opts bch2_opts_from_sb(struct bch_sb *sb) -+{ -+ struct bch_opts opts = bch2_opts_empty(); -+ -+#define x(_name, _bits, _mode, _type, _sb_opt, ...) \ -+ if (_sb_opt != NO_SB_OPT) \ -+ opt_set(opts, _name, _sb_opt(sb)); -+ BCH_OPTS() -+#undef x -+ -+ return opts; -+} -+ +const struct bch_option bch2_opt_table[] = { -+#define OPT_BOOL() .type = BCH_OPT_BOOL -+#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, .min = _min, .max = _max -+#define OPT_SECTORS(_min, _max) .type = BCH_OPT_SECTORS, .min = _min, .max = _max -+#define OPT_STR(_choices) .type = BCH_OPT_STR, .choices = _choices ++#define OPT_BOOL() .type = BCH_OPT_BOOL, .min = 0, .max = 2 ++#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, \ ++ .min = _min, .max = _max ++#define OPT_STR(_choices) .type = BCH_OPT_STR, \ ++ .min = 0, .max = ARRAY_SIZE(_choices),\ ++ .choices = _choices +#define OPT_FN(_fn) .type = BCH_OPT_FN, \ + .parse = _fn##_parse, \ + .to_text = _fn##_to_text + -+#define x(_name, _bits, _mode, _type, _sb_opt, _default, _hint, _help) \ ++#define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \ + [Opt_##_name] = { \ + .attr = { \ + .name = #_name, \ -+ .mode = (_mode) & OPT_RUNTIME ? 0644 : 0444, \ ++ .mode = (_flags) & OPT_RUNTIME ? 
0644 : 0444, \ + }, \ -+ .mode = _mode, \ ++ .flags = _flags, \ + .hint = _hint, \ + .help = _help, \ ++ .get_sb = _sb_opt, \ + .set_sb = SET_##_sb_opt, \ + _type \ + }, @@ -59020,8 +61889,43 @@ index 000000000000..a955ef2008c9 + return bch2_opt_lookup(name); +} + -+int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt, -+ const char *val, u64 *res) ++int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err) ++{ ++ if (v < opt->min) { ++ if (err) ++ pr_buf(err, "%s: too small (min %llu)", ++ opt->attr.name, opt->min); ++ return -ERANGE; ++ } ++ ++ if (opt->max && v >= opt->max) { ++ if (err) ++ pr_buf(err, "%s: too big (max %llu)", ++ opt->attr.name, opt->max); ++ return -ERANGE; ++ } ++ ++ if ((opt->flags & OPT_SB_FIELD_SECTORS) && (v & 511)) { ++ if (err) ++ pr_buf(err, "%s: not a multiple of 512", ++ opt->attr.name); ++ return -EINVAL; ++ } ++ ++ if ((opt->flags & OPT_MUST_BE_POW_2) && !is_power_of_2(v)) { ++ if (err) ++ pr_buf(err, "%s: must be a power of two", ++ opt->attr.name); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++int bch2_opt_parse(struct bch_fs *c, ++ const struct bch_option *opt, ++ const char *val, u64 *res, ++ struct printbuf *err) +{ + ssize_t ret; + @@ -59030,30 +61934,13 @@ index 000000000000..a955ef2008c9 + ret = kstrtou64(val, 10, res); + if (ret < 0) + return ret; -+ -+ if (*res > 1) -+ return -ERANGE; + break; + case BCH_OPT_UINT: -+ ret = kstrtou64(val, 10, res); ++ ret = opt->flags & OPT_HUMAN_READABLE ++ ? 
bch2_strtou64_h(val, res) ++ : kstrtou64(val, 10, res); + if (ret < 0) + return ret; -+ -+ if (*res < opt->min || *res >= opt->max) -+ return -ERANGE; -+ break; -+ case BCH_OPT_SECTORS: -+ ret = bch2_strtou64_h(val, res); -+ if (ret < 0) -+ return ret; -+ -+ if (*res & 511) -+ return -EINVAL; -+ -+ *res >>= 9; -+ -+ if (*res < opt->min || *res >= opt->max) -+ return -ERANGE; + break; + case BCH_OPT_STR: + ret = match_string(opt->choices, -1, val); @@ -59066,13 +61953,16 @@ index 000000000000..a955ef2008c9 + if (!c) + return 0; + -+ return opt->parse(c, val, res); ++ ret = opt->parse(c, val, res); ++ if (ret < 0) ++ return ret; + } + -+ return 0; ++ return bch2_opt_validate(opt, *res, err); +} + -+void bch2_opt_to_text(struct printbuf *out, struct bch_fs *c, ++void bch2_opt_to_text(struct printbuf *out, ++ struct bch_fs *c, struct bch_sb *sb, + const struct bch_option *opt, u64 v, + unsigned flags) +{ @@ -59090,10 +61980,10 @@ index 000000000000..a955ef2008c9 + switch (opt->type) { + case BCH_OPT_BOOL: + case BCH_OPT_UINT: -+ pr_buf(out, "%lli", v); -+ break; -+ case BCH_OPT_SECTORS: -+ bch2_hprint(out, v); ++ if (opt->flags & OPT_HUMAN_READABLE) ++ bch2_hprint(out, v); ++ else ++ pr_buf(out, "%lli", v); + break; + case BCH_OPT_STR: + if (flags & OPT_SHOW_FULL_LIST) @@ -59102,7 +61992,7 @@ index 000000000000..a955ef2008c9 + pr_buf(out, opt->choices[v]); + break; + case BCH_OPT_FN: -+ opt->to_text(out, c, v); ++ opt->to_text(out, c, sb, v); + break; + default: + BUG(); @@ -59148,6 +62038,7 @@ index 000000000000..a955ef2008c9 + char *copied_opts, *copied_opts_start; + char *opt, *name, *val; + int ret, id; ++ struct printbuf err = PRINTBUF; + u64 v; + + if (!options) @@ -59167,7 +62058,7 @@ index 000000000000..a955ef2008c9 + if (id < 0) + goto bad_opt; + -+ ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v); ++ ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err); + if (ret < 0) + goto bad_val; + } else { @@ -59187,7 +62078,7 @@ index 
000000000000..a955ef2008c9 + goto no_val; + } + -+ if (!(bch2_opt_table[id].mode & OPT_MOUNT)) ++ if (!(bch2_opt_table[id].flags & OPT_MOUNT)) + goto bad_opt; + + if (id == Opt_acl && @@ -59210,7 +62101,7 @@ index 000000000000..a955ef2008c9 + ret = -1; + goto out; +bad_val: -+ pr_err("Invalid value %s for mount option %s", val, name); ++ pr_err("Invalid mount option %s", err.buf); + ret = -1; + goto out; +no_val: @@ -59219,9 +62110,71 @@ index 000000000000..a955ef2008c9 + goto out; +out: + kfree(copied_opts_start); ++ printbuf_exit(&err); + return ret; +} + ++u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id) ++{ ++ const struct bch_option *opt = bch2_opt_table + id; ++ u64 v; ++ ++ v = opt->get_sb(sb); ++ ++ if (opt->flags & OPT_SB_FIELD_ILOG2) ++ v = 1ULL << v; ++ ++ if (opt->flags & OPT_SB_FIELD_SECTORS) ++ v <<= 9; ++ ++ return v; ++} ++ ++/* ++ * Initial options from superblock - here we don't want any options undefined, ++ * any options the superblock doesn't specify are set to 0: ++ */ ++int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb) ++{ ++ unsigned id; ++ ++ for (id = 0; id < bch2_opts_nr; id++) { ++ const struct bch_option *opt = bch2_opt_table + id; ++ ++ if (opt->get_sb == BCH2_NO_SB_OPT) ++ continue; ++ ++ bch2_opt_set_by_id(opts, id, bch2_opt_from_sb(sb, id)); ++ } ++ ++ return 0; ++} ++ ++void __bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v) ++{ ++ if (opt->set_sb == SET_BCH2_NO_SB_OPT) ++ return; ++ ++ if (opt->flags & OPT_SB_FIELD_SECTORS) ++ v >>= 9; ++ ++ if (opt->flags & OPT_SB_FIELD_ILOG2) ++ v = ilog2(v); ++ ++ opt->set_sb(sb, v); ++} ++ ++void bch2_opt_set_sb(struct bch_fs *c, const struct bch_option *opt, u64 v) ++{ ++ if (opt->set_sb == SET_BCH2_NO_SB_OPT) ++ return; ++ ++ mutex_lock(&c->sb_lock); ++ __bch2_opt_set_sb(c->disk_sb.sb, opt, v); ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++} ++ +/* io opts: */ + +struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src) @@ 
-59272,10 +62225,10 @@ index 000000000000..a955ef2008c9 +} diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h new file mode 100644 -index 000000000000..afb1bb2a62d2 +index 000000000000..8bc67d07afb9 --- /dev/null +++ b/fs/bcachefs/opts.h -@@ -0,0 +1,470 @@ +@@ -0,0 +1,517 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_OPTS_H +#define _BCACHEFS_OPTS_H @@ -59286,6 +62239,7 @@ index 000000000000..afb1bb2a62d2 +#include +#include "bcachefs_format.h" + ++extern const char * const bch2_metadata_versions[]; +extern const char * const bch2_error_actions[]; +extern const char * const bch2_sb_features[]; +extern const char * const bch2_sb_compat[]; @@ -59297,8 +62251,9 @@ index 000000000000..afb1bb2a62d2 +extern const char * const bch2_str_hash_types[]; +extern const char * const bch2_str_hash_opts[]; +extern const char * const bch2_data_types[]; -+extern const char * const bch2_cache_replacement_policies[]; +extern const char * const bch2_member_states[]; ++extern const char * const bch2_jset_entry_types[]; ++extern const char * const bch2_fs_usage_types[]; +extern const char * const bch2_d_types[]; + +static inline const char *bch2_d_type_str(unsigned d_type) @@ -59319,21 +62274,26 @@ index 000000000000..afb1bb2a62d2 + */ + +/* dummy option, for options that aren't stored in the superblock */ -+LE64_BITMASK(NO_SB_OPT, struct bch_sb, flags[0], 0, 0); ++u64 BCH2_NO_SB_OPT(const struct bch_sb *); ++void SET_BCH2_NO_SB_OPT(struct bch_sb *, u64); + +/* When can be set: */ -+enum opt_mode { -+ OPT_FORMAT = (1 << 0), -+ OPT_MOUNT = (1 << 1), -+ OPT_RUNTIME = (1 << 2), -+ OPT_INODE = (1 << 3), -+ OPT_DEVICE = (1 << 4), ++enum opt_flags { ++ OPT_FS = (1 << 0), /* Filesystem option */ ++ OPT_DEVICE = (1 << 1), /* Device option */ ++ OPT_INODE = (1 << 2), /* Inode option */ ++ OPT_FORMAT = (1 << 3), /* May be specified at format time */ ++ OPT_MOUNT = (1 << 4), /* May be specified at mount time */ ++ OPT_RUNTIME = (1 << 5), /* May be specified at runtime */ ++ 
OPT_HUMAN_READABLE = (1 << 6), ++ OPT_MUST_BE_POW_2 = (1 << 7), /* Must be power of 2 */ ++ OPT_SB_FIELD_SECTORS = (1 << 8),/* Superblock field is >> 9 of actual value */ ++ OPT_SB_FIELD_ILOG2 = (1 << 9), /* Superblock field is ilog2 of actual value */ +}; + +enum opt_type { + BCH_OPT_BOOL, + BCH_OPT_UINT, -+ BCH_OPT_SECTORS, + BCH_OPT_STR, + BCH_OPT_FN, +}; @@ -59358,281 +62318,315 @@ index 000000000000..afb1bb2a62d2 + */ + +#ifdef __KERNEL__ -+#define RATELIMIT_ERRORS true ++#define RATELIMIT_ERRORS_DEFAULT true +#else -+#define RATELIMIT_ERRORS false ++#define RATELIMIT_ERRORS_DEFAULT false +#endif + +#define BCH_OPTS() \ + x(block_size, u16, \ -+ OPT_FORMAT, \ -+ OPT_SECTORS(1, 128), \ ++ OPT_FS|OPT_FORMAT| \ ++ OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \ ++ OPT_UINT(512, 1U << 16), \ + BCH_SB_BLOCK_SIZE, 8, \ + "size", NULL) \ -+ x(btree_node_size, u16, \ -+ OPT_FORMAT, \ -+ OPT_SECTORS(1, 512), \ ++ x(btree_node_size, u32, \ ++ OPT_FS|OPT_FORMAT| \ ++ OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \ ++ OPT_UINT(512, 1U << 20), \ + BCH_SB_BTREE_NODE_SIZE, 512, \ + "size", "Btree node size, default 256k") \ + x(errors, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_STR(bch2_error_actions), \ + BCH_SB_ERROR_ACTION, BCH_ON_ERROR_ro, \ + NULL, "Action to take on filesystem error") \ + x(metadata_replicas, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(1, BCH_REPLICAS_MAX), \ + BCH_SB_META_REPLICAS_WANT, 1, \ + "#", "Number of metadata replicas") \ + x(data_replicas, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(1, BCH_REPLICAS_MAX), \ + BCH_SB_DATA_REPLICAS_WANT, 1, \ + "#", "Number of data replicas") \ + x(metadata_replicas_required, u8, \ -+ OPT_FORMAT|OPT_MOUNT, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ + OPT_UINT(1, BCH_REPLICAS_MAX), \ + 
BCH_SB_META_REPLICAS_REQ, 1, \ + "#", NULL) \ + x(data_replicas_required, u8, \ -+ OPT_FORMAT|OPT_MOUNT, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ + OPT_UINT(1, BCH_REPLICAS_MAX), \ + BCH_SB_DATA_REPLICAS_REQ, 1, \ + "#", NULL) \ ++ x(encoded_extent_max, u32, \ ++ OPT_FS|OPT_FORMAT| \ ++ OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS|OPT_SB_FIELD_ILOG2,\ ++ OPT_UINT(4096, 2U << 20), \ ++ BCH_SB_ENCODED_EXTENT_MAX_BITS, 64 << 10, \ ++ "size", "Maximum size of checksummed/compressed extents")\ + x(metadata_checksum, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_STR(bch2_csum_opts), \ + BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ + NULL, NULL) \ + x(data_checksum, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_STR(bch2_csum_opts), \ + BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ + NULL, NULL) \ + x(compression, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_STR(bch2_compression_opts), \ + BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_none, \ + NULL, NULL) \ + x(background_compression, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_STR(bch2_compression_opts), \ + BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_none, \ + NULL, NULL) \ + x(str_hash, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_STR(bch2_str_hash_opts), \ + BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \ + NULL, "Hash function for directory entries and xattrs")\ + x(metadata_target, u16, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FN(bch2_opt_target), \ + BCH_SB_METADATA_TARGET, 0, \ + "(target)", "Device or disk group for metadata writes") \ + x(foreground_target, u16, \ -+ 
OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FN(bch2_opt_target), \ + BCH_SB_FOREGROUND_TARGET, 0, \ + "(target)", "Device or disk group for foreground writes") \ + x(background_target, u16, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FN(bch2_opt_target), \ + BCH_SB_BACKGROUND_TARGET, 0, \ + "(target)", "Device or disk group to move data to in the background")\ + x(promote_target, u16, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FN(bch2_opt_target), \ + BCH_SB_PROMOTE_TARGET, 0, \ + "(target)", "Device or disk group to promote data to on read")\ + x(erasure_code, u16, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_ERASURE_CODE, false, \ + NULL, "Enable erasure coding (DO NOT USE YET)") \ + x(inodes_32bit, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_INODE_32BIT, true, \ + NULL, "Constrain inode numbers to 32 bits") \ + x(shard_inode_numbers, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_SHARD_INUMS, true, \ + NULL, "Shard new inode numbers by CPU id") \ + x(inodes_use_key_cache, u8, \ -+ OPT_FORMAT|OPT_MOUNT, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH_SB_INODES_USE_KEY_CACHE, true, \ + NULL, "Use the btree key cache for the inodes btree") \ + x(btree_node_mem_ptr_optimization, u8, \ -+ OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ -+ NO_SB_OPT, true, \ ++ BCH2_NO_SB_OPT, true, \ + NULL, "Stash pointer to in memory btree node in btree ptr")\ + x(gc_reserve_percent, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(5, 21), \ + BCH_SB_GC_RESERVE, 8, \ + 
"%", "Percentage of disk space to reserve for copygc")\ + x(gc_reserve_bytes, u64, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_SECTORS(0, U64_MAX), \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME| \ ++ OPT_HUMAN_READABLE|OPT_SB_FIELD_SECTORS, \ ++ OPT_UINT(0, U64_MAX), \ + BCH_SB_GC_RESERVE_BYTES, 0, \ + "%", "Amount of disk space to reserve for copygc\n" \ + "Takes precedence over gc_reserve_percent if set")\ + x(root_reserve_percent, u8, \ -+ OPT_FORMAT|OPT_MOUNT, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ + OPT_UINT(0, 100), \ + BCH_SB_ROOT_RESERVE, 0, \ + "%", "Percentage of disk space to reserve for superuser")\ + x(wide_macs, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_128_BIT_MACS, false, \ + NULL, "Store full 128 bits of cryptographic MACs, instead of 80")\ + x(inline_data, u8, \ -+ OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ -+ NO_SB_OPT, true, \ ++ BCH2_NO_SB_OPT, true, \ + NULL, "Enable inline data extents") \ + x(acl, u8, \ -+ OPT_FORMAT|OPT_MOUNT, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH_SB_POSIX_ACL, true, \ + NULL, "Enable POSIX acls") \ + x(usrquota, u8, \ -+ OPT_FORMAT|OPT_MOUNT, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH_SB_USRQUOTA, false, \ + NULL, "Enable user quotas") \ + x(grpquota, u8, \ -+ OPT_FORMAT|OPT_MOUNT, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH_SB_GRPQUOTA, false, \ + NULL, "Enable group quotas") \ + x(prjquota, u8, \ -+ OPT_FORMAT|OPT_MOUNT, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH_SB_PRJQUOTA, false, \ + NULL, "Enable project quotas") \ + x(degraded, u8, \ -+ OPT_MOUNT, \ ++ OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ -+ NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, "Allow mounting in degraded mode") \ + x(very_degraded, u8, \ -+ OPT_MOUNT, \ ++ OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ -+ NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, "Allow mounting in when data 
will be missing") \ + x(discard, u8, \ -+ OPT_MOUNT|OPT_DEVICE, \ ++ OPT_FS|OPT_MOUNT|OPT_DEVICE, \ + OPT_BOOL(), \ -+ NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, true, \ + NULL, "Enable discard/TRIM support") \ + x(verbose, u8, \ -+ OPT_MOUNT, \ ++ OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ -+ NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, "Extra debugging information during mount/recovery")\ ++ x(journal_flush_delay, u32, \ ++ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_UINT(1, U32_MAX), \ ++ BCH_SB_JOURNAL_FLUSH_DELAY, 1000, \ ++ NULL, "Delay in milliseconds before automatic journal commits")\ + x(journal_flush_disabled, u8, \ -+ OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ -+ NO_SB_OPT, false, \ ++ BCH_SB_JOURNAL_FLUSH_DISABLED,false, \ + NULL, "Disable journal flush on sync/fsync\n" \ + "If enabled, writes can be lost, but only since the\n"\ + "last journal write (default 1 second)") \ ++ x(journal_reclaim_delay, u32, \ ++ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_UINT(0, U32_MAX), \ ++ BCH_SB_JOURNAL_RECLAIM_DELAY, 100, \ ++ NULL, "Delay in milliseconds before automatic journal reclaim")\ + x(fsck, u8, \ -+ OPT_MOUNT, \ ++ OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ -+ NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, "Run fsck on mount") \ + x(fix_errors, u8, \ -+ OPT_MOUNT, \ ++ OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ -+ NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, "Fix errors during fsck without asking") \ + x(ratelimit_errors, u8, \ -+ OPT_MOUNT, \ ++ OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ -+ NO_SB_OPT, RATELIMIT_ERRORS, \ ++ BCH2_NO_SB_OPT, RATELIMIT_ERRORS_DEFAULT, \ + NULL, "Ratelimit error messages during fsck") \ + x(nochanges, u8, \ -+ OPT_MOUNT, \ ++ OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ -+ NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, "Super read only mode - no writes at all will be issued,\n"\ + "even if we have to replay the journal") \ + x(norecovery, u8, \ -+ OPT_MOUNT, \ ++ OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ -+ 
NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, "Don't replay the journal") \ + x(rebuild_replicas, u8, \ -+ OPT_MOUNT, \ ++ OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ -+ NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, "Rebuild the superblock replicas section") \ + x(keep_journal, u8, \ -+ OPT_MOUNT, \ ++ 0, \ + OPT_BOOL(), \ -+ NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, "Don't free journal entries/keys after startup")\ + x(read_entire_journal, u8, \ + 0, \ + OPT_BOOL(), \ -+ NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, "Read all journal entries, not just dirty ones")\ -+ x(noexcl, u8, \ -+ OPT_MOUNT, \ ++ x(read_journal_only, u8, \ ++ 0, \ + OPT_BOOL(), \ -+ NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ ++ NULL, "Only read the journal, skip the rest of recovery")\ ++ x(journal_transaction_names, u8, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_BOOL(), \ ++ BCH_SB_JOURNAL_TRANSACTION_NAMES, true, \ ++ NULL, "Log transaction function names in journal") \ ++ x(noexcl, u8, \ ++ OPT_FS|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, false, \ + NULL, "Don't open device in exclusive mode") \ + x(sb, u64, \ + OPT_MOUNT, \ + OPT_UINT(0, S64_MAX), \ -+ NO_SB_OPT, BCH_SB_SECTOR, \ ++ BCH2_NO_SB_OPT, BCH_SB_SECTOR, \ + "offset", "Sector offset of superblock") \ + x(read_only, u8, \ -+ 0, \ ++ OPT_FS, \ + OPT_BOOL(), \ -+ NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, NULL) \ + x(nostart, u8, \ + 0, \ + OPT_BOOL(), \ -+ NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, "Don\'t start filesystem, only open devices") \ + x(reconstruct_alloc, u8, \ -+ OPT_MOUNT, \ ++ OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ -+ NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, "Reconstruct alloc btree") \ + x(version_upgrade, u8, \ -+ OPT_MOUNT, \ ++ OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ -+ NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, "Set superblock to latest version,\n" \ + "allowing any new features to be used") \ ++ 
x(buckets_nouse, u8, \ ++ 0, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, false, \ ++ NULL, "Allocate the buckets_nouse bitmap") \ + x(project, u8, \ + OPT_INODE, \ + OPT_BOOL(), \ -+ NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, NULL) \ + x(fs_size, u64, \ + OPT_DEVICE, \ -+ OPT_SECTORS(0, S64_MAX), \ -+ NO_SB_OPT, 0, \ ++ OPT_UINT(0, S64_MAX), \ ++ BCH2_NO_SB_OPT, 0, \ + "size", "Size of filesystem on device") \ + x(bucket, u32, \ + OPT_DEVICE, \ -+ OPT_SECTORS(0, S64_MAX), \ -+ NO_SB_OPT, 0, \ ++ OPT_UINT(0, S64_MAX), \ ++ BCH2_NO_SB_OPT, 0, \ + "size", "Size of filesystem on device") \ + x(durability, u8, \ + OPT_DEVICE, \ + OPT_UINT(0, BCH_REPLICAS_MAX), \ -+ NO_SB_OPT, 1, \ ++ BCH2_NO_SB_OPT, 1, \ + "n", "Data written to this device will be considered\n"\ + "to have already been replicated n times") + @@ -59685,20 +62679,21 @@ index 000000000000..afb1bb2a62d2 + +struct bch_option { + struct attribute attr; ++ u64 (*get_sb)(const struct bch_sb *); + void (*set_sb)(struct bch_sb *, u64); -+ enum opt_mode mode; + enum opt_type type; ++ enum opt_flags flags; ++ u64 min, max; + + union { + struct { -+ u64 min, max; + }; + struct { + const char * const *choices; + }; + struct { + int (*parse)(struct bch_fs *, const char *, u64 *); -+ void (*to_text)(struct printbuf *, struct bch_fs *, u64); ++ void (*to_text)(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); + }; + }; + @@ -59713,15 +62708,20 @@ index 000000000000..afb1bb2a62d2 +u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id); +void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64); + -+struct bch_opts bch2_opts_from_sb(struct bch_sb *); ++u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id); ++int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *); ++void __bch2_opt_set_sb(struct bch_sb *, const struct bch_option *, u64); ++void bch2_opt_set_sb(struct bch_fs *, const struct bch_option *, u64); + +int bch2_opt_lookup(const char *); -+int bch2_opt_parse(struct bch_fs *, 
const struct bch_option *, const char *, u64 *); ++int bch2_opt_validate(const struct bch_option *, u64, struct printbuf *); ++int bch2_opt_parse(struct bch_fs *, const struct bch_option *, ++ const char *, u64 *, struct printbuf *); + +#define OPT_SHOW_FULL_LIST (1 << 0) +#define OPT_SHOW_MOUNT_STYLE (1 << 1) + -+void bch2_opt_to_text(struct printbuf *, struct bch_fs *, ++void bch2_opt_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, + const struct bch_option *, u64, unsigned); + +int bch2_opt_check_may_set(struct bch_fs *, int, u64); @@ -59748,10 +62748,10 @@ index 000000000000..afb1bb2a62d2 +#endif /* _BCACHEFS_OPTS_H */ diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c new file mode 100644 -index 000000000000..8f8f4b0accd6 +index 000000000000..ca029a00e7b8 --- /dev/null +++ b/fs/bcachefs/quota.c -@@ -0,0 +1,821 @@ +@@ -0,0 +1,852 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "btree_update.h" @@ -59760,19 +62760,55 @@ index 000000000000..8f8f4b0accd6 +#include "subvolume.h" +#include "super-io.h" + -+static const char *bch2_sb_validate_quota(struct bch_sb *sb, -+ struct bch_sb_field *f) ++static const char * const bch2_quota_types[] = { ++ "user", ++ "group", ++ "project", ++}; ++ ++static const char * const bch2_quota_counters[] = { ++ "space", ++ "inodes", ++}; ++ ++static int bch2_sb_quota_validate(struct bch_sb *sb, struct bch_sb_field *f, ++ struct printbuf *err) +{ + struct bch_sb_field_quota *q = field_to_type(f, quota); + -+ if (vstruct_bytes(&q->field) != sizeof(*q)) -+ return "invalid field quota: wrong size"; ++ if (vstruct_bytes(&q->field) < sizeof(*q)) { ++ pr_buf(err, "wrong size (got %zu should be %zu)", ++ vstruct_bytes(&q->field), sizeof(*q)); ++ return -EINVAL; ++ } + -+ return NULL; ++ return 0; ++} ++ ++static void bch2_sb_quota_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_quota *q = field_to_type(f, quota); ++ unsigned qtyp, counter; ++ ++ 
for (qtyp = 0; qtyp < ARRAY_SIZE(q->q); qtyp++) { ++ pr_buf(out, "%s: flags %llx", ++ bch2_quota_types[qtyp], ++ le64_to_cpu(q->q[qtyp].flags)); ++ ++ for (counter = 0; counter < Q_COUNTERS; counter++) ++ pr_buf(out, " %s timelimit %u warnlimit %u", ++ bch2_quota_counters[counter], ++ le32_to_cpu(q->q[qtyp].c[counter].timelimit), ++ le32_to_cpu(q->q[qtyp].c[counter].warnlimit)); ++ ++ pr_newline(out); ++ } +} + +const struct bch_sb_field_ops bch_sb_field_ops_quota = { -+ .validate = bch2_sb_validate_quota, ++ .validate = bch2_sb_quota_validate, ++ .to_text = bch2_sb_quota_to_text, +}; + +const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k) @@ -59786,11 +62822,6 @@ index 000000000000..8f8f4b0accd6 + return NULL; +} + -+static const char * const bch2_quota_counters[] = { -+ "space", -+ "inodes", -+}; -+ +void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ @@ -60324,7 +63355,7 @@ index 000000000000..8f8f4b0accd6 + ret = bch2_btree_delete_range(c, BTREE_ID_quotas, + POS(QTYP_USR, 0), + POS(QTYP_USR + 1, 0), -+ NULL); ++ 0, NULL); + if (ret) + return ret; + } @@ -60336,7 +63367,7 @@ index 000000000000..8f8f4b0accd6 + ret = bch2_btree_delete_range(c, BTREE_ID_quotas, + POS(QTYP_GRP, 0), + POS(QTYP_GRP + 1, 0), -+ NULL); ++ 0, NULL); + if (ret) + return ret; + } @@ -60348,7 +63379,7 @@ index 000000000000..8f8f4b0accd6 + ret = bch2_btree_delete_range(c, BTREE_ID_quotas, + POS(QTYP_PRJ, 0), + POS(QTYP_PRJ + 1, 0), -+ NULL); ++ 0, NULL); + if (ret) + return ret; + } @@ -60701,10 +63732,10 @@ index 000000000000..6a136083d389 +#endif /* _BCACHEFS_QUOTA_TYPES_H */ diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c new file mode 100644 -index 000000000000..a573fede05b1 +index 000000000000..d914892f5339 --- /dev/null +++ b/fs/bcachefs/rebalance.c -@@ -0,0 +1,337 @@ +@@ -0,0 +1,349 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -60964,35 +63995,47 @@ index 000000000000..a573fede05b1 +{ + 
struct bch_fs_rebalance *r = &c->rebalance; + struct rebalance_work w = rebalance_work(c); -+ char h1[21], h2[21]; + -+ bch2_hprint(&PBUF(h1), w.dev_most_full_work << 9); -+ bch2_hprint(&PBUF(h2), w.dev_most_full_capacity << 9); -+ pr_buf(out, "fullest_dev (%i):\t%s/%s\n", -+ w.dev_most_full_idx, h1, h2); ++ out->tabstops[0] = 20; + -+ bch2_hprint(&PBUF(h1), w.total_work << 9); -+ bch2_hprint(&PBUF(h2), c->capacity << 9); -+ pr_buf(out, "total work:\t\t%s/%s\n", h1, h2); ++ pr_buf(out, "fullest_dev (%i):", w.dev_most_full_idx); ++ pr_tab(out); + -+ pr_buf(out, "rate:\t\t\t%u\n", r->pd.rate.rate); ++ bch2_hprint(out, w.dev_most_full_work << 9); ++ pr_buf(out, "/"); ++ bch2_hprint(out, w.dev_most_full_capacity << 9); ++ pr_newline(out); ++ ++ pr_buf(out, "total work:"); ++ pr_tab(out); ++ ++ bch2_hprint(out, w.total_work << 9); ++ pr_buf(out, "/"); ++ bch2_hprint(out, c->capacity << 9); ++ pr_newline(out); ++ ++ pr_buf(out, "rate:"); ++ pr_tab(out); ++ pr_buf(out, "%u", r->pd.rate.rate); ++ pr_newline(out); + + switch (r->state) { + case REBALANCE_WAITING: -+ pr_buf(out, "waiting\n"); ++ pr_buf(out, "waiting"); + break; + case REBALANCE_THROTTLED: -+ bch2_hprint(&PBUF(h1), ++ pr_buf(out, "throttled for %lu sec or ", ++ (r->throttled_until_cputime - jiffies) / HZ); ++ bch2_hprint(out, + (r->throttled_until_iotime - + atomic64_read(&c->io_clock[WRITE].now)) << 9); -+ pr_buf(out, "throttled for %lu sec or %s io\n", -+ (r->throttled_until_cputime - jiffies) / HZ, -+ h1); ++ pr_buf(out, " io"); + break; + case REBALANCE_RUNNING: -+ pr_buf(out, "running\n"); ++ pr_buf(out, "running"); + break; + } ++ pr_newline(out); +} + +void bch2_rebalance_stop(struct bch_fs *c) @@ -61110,10 +64153,10 @@ index 000000000000..7462a92e9598 +#endif /* _BCACHEFS_REBALANCE_TYPES_H */ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c new file mode 100644 -index 000000000000..c3b4d116275c +index 000000000000..ca92fe84c248 --- /dev/null +++ b/fs/bcachefs/recovery.c -@@ -0,0 +1,1498 @@ 
+@@ -0,0 +1,1472 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -61132,6 +64175,7 @@ index 000000000000..c3b4d116275c +#include "journal_io.h" +#include "journal_reclaim.h" +#include "journal_seq_blacklist.h" ++#include "lru.h" +#include "move.h" +#include "quota.h" +#include "recovery.h" @@ -61175,23 +64219,21 @@ index 000000000000..c3b4d116275c +static int __journal_key_cmp(enum btree_id l_btree_id, + unsigned l_level, + struct bpos l_pos, -+ struct journal_key *r) ++ const struct journal_key *r) +{ + return (cmp_int(l_btree_id, r->btree_id) ?: + cmp_int(l_level, r->level) ?: + bpos_cmp(l_pos, r->k->k.p)); +} + -+static int journal_key_cmp(struct journal_key *l, struct journal_key *r) ++static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r) +{ -+ return (cmp_int(l->btree_id, r->btree_id) ?: -+ cmp_int(l->level, r->level) ?: -+ bpos_cmp(l->k->k.p, r->k->k.p)); ++ return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r); +} + -+static size_t journal_key_search(struct journal_keys *journal_keys, -+ enum btree_id id, unsigned level, -+ struct bpos pos) ++size_t bch2_journal_key_search(struct journal_keys *journal_keys, ++ enum btree_id id, unsigned level, ++ struct bpos pos) +{ + size_t l = 0, r = journal_keys->nr, m; + @@ -61212,6 +64254,24 @@ index 000000000000..c3b4d116275c + return l; +} + ++struct bkey_i *bch2_journal_keys_peek(struct bch_fs *c, enum btree_id btree_id, ++ unsigned level, struct bpos pos) ++{ ++ struct journal_keys *keys = &c->journal_keys; ++ struct journal_key *end = keys->d + keys->nr; ++ struct journal_key *k = keys->d + ++ bch2_journal_key_search(keys, btree_id, level, pos); ++ ++ while (k < end && k->overwritten) ++ k++; ++ ++ if (k < end && ++ k->btree_id == btree_id && ++ k->level == level) ++ return k->k; ++ return NULL; ++} ++ +static void journal_iter_fix(struct bch_fs *c, struct journal_iter *iter, unsigned idx) +{ + struct bkey_i *n = iter->keys->d[idx].k; @@ -61225,18 +64285,25 
@@ index 000000000000..c3b4d116275c + iter->idx++; +} + -+int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, -+ unsigned level, struct bkey_i *k) ++int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, ++ unsigned level, struct bkey_i *k) +{ + struct journal_key n = { + .btree_id = id, + .level = level, + .k = k, -+ .allocated = true ++ .allocated = true, ++ /* ++ * Ensure these keys are done last by journal replay, to unblock ++ * journal reclaim: ++ */ ++ .journal_seq = U32_MAX, + }; + struct journal_keys *keys = &c->journal_keys; + struct journal_iter *iter; -+ unsigned idx = journal_key_search(keys, id, level, k->k.p); ++ size_t idx = bch2_journal_key_search(keys, id, level, k->k.p); ++ ++ BUG_ON(test_bit(BCH_FS_RW, &c->flags)); + + if (idx < keys->nr && + journal_key_cmp(&n, &keys->d[idx]) == 0) { @@ -61273,38 +64340,66 @@ index 000000000000..c3b4d116275c + return 0; +} + ++/* ++ * Can only be used from the recovery thread while we're still RO - can't be ++ * used once we've got RW, as journal_keys is at that point used by multiple ++ * threads: ++ */ ++int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, ++ unsigned level, struct bkey_i *k) ++{ ++ struct bkey_i *n; ++ int ret; ++ ++ n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL); ++ if (!n) ++ return -ENOMEM; ++ ++ bkey_copy(n, k); ++ ret = bch2_journal_key_insert_take(c, id, level, n); ++ if (ret) ++ kfree(n); ++ return ret; ++} ++ +int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id, + unsigned level, struct bpos pos) +{ -+ struct bkey_i *whiteout = -+ kmalloc(sizeof(struct bkey), GFP_KERNEL); -+ int ret; ++ struct bkey_i whiteout; + -+ if (!whiteout) { -+ bch_err(c, "%s: error allocating new key", __func__); -+ return -ENOMEM; -+ } ++ bkey_init(&whiteout.k); ++ whiteout.k.p = pos; + -+ bkey_init(&whiteout->k); -+ whiteout->k.p = pos; ++ return bch2_journal_key_insert(c, id, level, &whiteout); ++} + -+ ret = bch2_journal_key_insert(c, id, level, whiteout); -+ if 
(ret) -+ kfree(whiteout); -+ return ret; ++void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, ++ unsigned level, struct bpos pos) ++{ ++ struct journal_keys *keys = &c->journal_keys; ++ size_t idx = bch2_journal_key_search(keys, btree, level, pos); ++ ++ if (idx < keys->nr && ++ keys->d[idx].btree_id == btree && ++ keys->d[idx].level == level && ++ !bpos_cmp(keys->d[idx].k->k.p, pos)) ++ keys->d[idx].overwritten = true; +} + +static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter) +{ -+ struct journal_key *k = iter->idx - iter->keys->nr -+ ? iter->keys->d + iter->idx : NULL; ++ struct journal_key *k = iter->keys->d + iter->idx; + -+ if (k && -+ k->btree_id == iter->btree_id && -+ k->level == iter->level) -+ return k->k; ++ while (k < iter->keys->d + iter->keys->nr && ++ k->btree_id == iter->btree_id && ++ k->level == iter->level) { ++ if (!k->overwritten) ++ return k->k; ++ ++ iter->idx++; ++ k = iter->keys->d + iter->idx; ++ } + -+ iter->idx = iter->keys->nr; + return NULL; +} + @@ -61327,8 +64422,7 @@ index 000000000000..c3b4d116275c + iter->btree_id = id; + iter->level = level; + iter->keys = &c->journal_keys; -+ iter->idx = journal_key_search(&c->journal_keys, id, level, pos); -+ list_add(&iter->list, &c->journal_iters); ++ iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos); +} + +static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter) @@ -61414,106 +64508,33 @@ index 000000000000..c3b4d116275c + bch2_journal_iter_exit(&iter->journal); +} + -+void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, -+ struct bch_fs *c, -+ struct btree *b) ++void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, ++ struct bch_fs *c, ++ struct btree *b, ++ struct btree_node_iter node_iter, ++ struct bpos pos) +{ + memset(iter, 0, sizeof(*iter)); + + iter->b = b; -+ bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b); -+ 
bch2_journal_iter_init(c, &iter->journal, -+ b->c.btree_id, b->c.level, b->data->min_key); ++ iter->node_iter = node_iter; ++ bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos); ++ INIT_LIST_HEAD(&iter->journal.list); +} + -+/* Walk btree, overlaying keys from the journal: */ -+ -+static void btree_and_journal_iter_prefetch(struct bch_fs *c, struct btree *b, -+ struct btree_and_journal_iter iter) ++/* ++ * this version is used by btree_gc before filesystem has gone RW and ++ * multithreaded, so uses the journal_iters list: ++ */ ++void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, ++ struct bch_fs *c, ++ struct btree *b) +{ -+ unsigned i = 0, nr = b->c.level > 1 ? 2 : 16; -+ struct bkey_s_c k; -+ struct bkey_buf tmp; ++ struct btree_node_iter node_iter; + -+ BUG_ON(!b->c.level); -+ -+ bch2_bkey_buf_init(&tmp); -+ -+ while (i < nr && -+ (k = bch2_btree_and_journal_iter_peek(&iter)).k) { -+ bch2_bkey_buf_reassemble(&tmp, c, k); -+ -+ bch2_btree_node_prefetch(c, NULL, NULL, tmp.k, -+ b->c.btree_id, b->c.level - 1); -+ -+ bch2_btree_and_journal_iter_advance(&iter); -+ i++; -+ } -+ -+ bch2_bkey_buf_exit(&tmp, c); -+} -+ -+static int bch2_btree_and_journal_walk_recurse(struct btree_trans *trans, struct btree *b, -+ enum btree_id btree_id, -+ btree_walk_key_fn key_fn) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_and_journal_iter iter; -+ struct bkey_s_c k; -+ struct bkey_buf tmp; -+ struct btree *child; -+ int ret = 0; -+ -+ bch2_bkey_buf_init(&tmp); -+ bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); -+ -+ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { -+ if (b->c.level) { -+ bch2_bkey_buf_reassemble(&tmp, c, k); -+ -+ child = bch2_btree_node_get_noiter(c, tmp.k, -+ b->c.btree_id, b->c.level - 1, -+ false); -+ -+ ret = PTR_ERR_OR_ZERO(child); -+ if (ret) -+ break; -+ -+ btree_and_journal_iter_prefetch(c, b, iter); -+ -+ ret = bch2_btree_and_journal_walk_recurse(trans, child, -+ btree_id, 
key_fn); -+ six_unlock_read(&child->c.lock); -+ } else { -+ ret = key_fn(trans, k); -+ } -+ -+ if (ret) -+ break; -+ -+ bch2_btree_and_journal_iter_advance(&iter); -+ } -+ -+ bch2_btree_and_journal_iter_exit(&iter); -+ bch2_bkey_buf_exit(&tmp, c); -+ return ret; -+} -+ -+int bch2_btree_and_journal_walk(struct btree_trans *trans, enum btree_id btree_id, -+ btree_walk_key_fn key_fn) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree *b = c->btree_roots[btree_id].b; -+ int ret = 0; -+ -+ if (btree_node_fake(b)) -+ return 0; -+ -+ six_lock_read(&b->c.lock, NULL, NULL); -+ ret = bch2_btree_and_journal_walk_recurse(trans, b, btree_id, key_fn); -+ six_unlock_read(&b->c.lock); -+ -+ return ret; ++ bch2_btree_node_iter_init_from_start(&node_iter, b); ++ __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key); ++ list_add(&iter->journal.list, &c->journal_iters); +} + +/* sort and dedup all keys in the journal: */ @@ -61538,9 +64559,7 @@ index 000000000000..c3b4d116275c + const struct journal_key *l = _l; + const struct journal_key *r = _r; + -+ return cmp_int(l->btree_id, r->btree_id) ?: -+ cmp_int(l->level, r->level) ?: -+ bpos_cmp(l->k->k.p, r->k->k.p) ?: ++ return journal_key_cmp(l, r) ?: + cmp_int(l->journal_seq, r->journal_seq) ?: + cmp_int(l->journal_offset, r->journal_offset); +} @@ -61633,8 +64652,8 @@ index 000000000000..c3b4d116275c + bch2_journal_pin_put(j, j->replay_journal_seq++); +} + -+static int __bch2_journal_replay_key(struct btree_trans *trans, -+ struct journal_key *k) ++static int bch2_journal_replay_key(struct btree_trans *trans, ++ struct journal_key *k) +{ + struct btree_iter iter; + unsigned iter_flags = @@ -61643,111 +64662,75 @@ index 000000000000..c3b4d116275c + int ret; + + if (!k->level && k->btree_id == BTREE_ID_alloc) -+ iter_flags |= BTREE_ITER_CACHED|BTREE_ITER_CACHED_NOFILL; ++ iter_flags |= BTREE_ITER_CACHED; + + bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, + BTREE_MAX_DEPTH, k->level, + 
iter_flags); -+ ret = bch2_btree_iter_traverse(&iter) ?: -+ bch2_trans_update(trans, &iter, k->k, BTREE_TRIGGER_NORUN); ++ ret = bch2_btree_iter_traverse(&iter); ++ if (ret) ++ goto out; ++ ++ /* Must be checked with btree locked: */ ++ if (k->overwritten) ++ goto out; ++ ++ ret = bch2_trans_update(trans, &iter, k->k, BTREE_TRIGGER_NORUN); ++out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + -+static int bch2_journal_replay_key(struct bch_fs *c, struct journal_key *k) -+{ -+ unsigned commit_flags = -+ BTREE_INSERT_LAZY_RW| -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_JOURNAL_RESERVED; -+ -+ if (!k->allocated) -+ commit_flags |= BTREE_INSERT_JOURNAL_REPLAY; -+ -+ return bch2_trans_do(c, NULL, NULL, commit_flags, -+ __bch2_journal_replay_key(&trans, k)); -+} -+ +static int journal_sort_seq_cmp(const void *_l, const void *_r) +{ -+ const struct journal_key *l = _l; -+ const struct journal_key *r = _r; ++ const struct journal_key *l = *((const struct journal_key **)_l); ++ const struct journal_key *r = *((const struct journal_key **)_r); + -+ return cmp_int(r->level, l->level) ?: -+ cmp_int(l->journal_seq, r->journal_seq) ?: -+ cmp_int(l->btree_id, r->btree_id) ?: -+ bpos_cmp(l->k->k.p, r->k->k.p); ++ return cmp_int(l->journal_seq, r->journal_seq); +} + -+static int bch2_journal_replay(struct bch_fs *c, -+ struct journal_keys keys) ++static int bch2_journal_replay(struct bch_fs *c) +{ ++ struct journal_keys *keys = &c->journal_keys; ++ struct journal_key **keys_sorted, *k; + struct journal *j = &c->journal; -+ struct journal_key *i; -+ u64 seq; ++ size_t i; + int ret; + -+ sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_seq_cmp, NULL); ++ keys_sorted = kvmalloc_array(sizeof(*keys_sorted), keys->nr, GFP_KERNEL); ++ if (!keys_sorted) ++ return -ENOMEM; + -+ if (keys.nr) -+ replay_now_at(j, keys.journal_seq_base); ++ for (i = 0; i < keys->nr; i++) ++ keys_sorted[i] = &keys->d[i]; + -+ seq = j->replay_journal_seq; ++ sort(keys_sorted, keys->nr, ++ 
sizeof(keys_sorted[0]), ++ journal_sort_seq_cmp, NULL); ++ ++ if (keys->nr) ++ replay_now_at(j, keys->journal_seq_base); ++ ++ for (i = 0; i < keys->nr; i++) { ++ k = keys_sorted[i]; + -+ /* -+ * First replay updates to the alloc btree - these will only update the -+ * btree key cache: -+ */ -+ for_each_journal_key(keys, i) { + cond_resched(); + -+ if (!i->level && i->btree_id == BTREE_ID_alloc) { -+ j->replay_journal_seq = keys.journal_seq_base + i->journal_seq; -+ ret = bch2_journal_replay_key(c, i); -+ if (ret) -+ goto err; -+ } -+ } ++ if (!k->allocated) ++ replay_now_at(j, keys->journal_seq_base + k->journal_seq); + -+ /* -+ * Next replay updates to interior btree nodes: -+ */ -+ for_each_journal_key(keys, i) { -+ cond_resched(); -+ -+ if (i->level) { -+ j->replay_journal_seq = keys.journal_seq_base + i->journal_seq; -+ ret = bch2_journal_replay_key(c, i); -+ if (ret) -+ goto err; -+ } -+ } -+ -+ /* -+ * Now that the btree is in a consistent state, we can start journal -+ * reclaim (which will be flushing entries from the btree key cache back -+ * to the btree: -+ */ -+ set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags); -+ set_bit(JOURNAL_RECLAIM_STARTED, &j->flags); -+ journal_reclaim_kick(j); -+ -+ j->replay_journal_seq = seq; -+ -+ /* -+ * Now replay leaf node updates: -+ */ -+ for_each_journal_key(keys, i) { -+ cond_resched(); -+ -+ if (i->level || i->btree_id == BTREE_ID_alloc) -+ continue; -+ -+ replay_now_at(j, keys.journal_seq_base + i->journal_seq); -+ -+ ret = bch2_journal_replay_key(c, i); -+ if (ret) ++ ret = bch2_trans_do(c, NULL, NULL, ++ BTREE_INSERT_LAZY_RW| ++ BTREE_INSERT_NOFAIL| ++ (!k->allocated ++ ? 
BTREE_INSERT_JOURNAL_REPLAY|JOURNAL_WATERMARK_reserved ++ : 0), ++ bch2_journal_replay_key(&trans, k)); ++ if (ret) { ++ bch_err(c, "journal replay: error %d while replaying key at btree %s level %u", ++ ret, bch2_btree_ids[k->btree_id], k->level); + goto err; ++ } + } + + replay_now_at(j, j->replay_journal_seq_end); @@ -61755,10 +64738,12 @@ index 000000000000..c3b4d116275c + + bch2_journal_set_replay_done(j); + bch2_journal_flush_all_pins(j); -+ return bch2_journal_error(j); ++ ret = bch2_journal_error(j); ++ ++ if (keys->nr && !ret) ++ bch2_journal_log_msg(&c->journal, "journal replay finished"); +err: -+ bch_err(c, "journal replay: error %d while replaying key at btree %s level %u", -+ ret, bch2_btree_ids[i->btree_id], i->level); ++ kvfree(keys_sorted); + return ret; +} + @@ -61796,15 +64781,15 @@ index 000000000000..c3b4d116275c + container_of(entry, struct jset_entry_usage, entry); + + switch (entry->btree_id) { -+ case FS_USAGE_RESERVED: ++ case BCH_FS_USAGE_reserved: + if (entry->level < BCH_REPLICAS_MAX) + c->usage_base->persistent_reserved[entry->level] = + le64_to_cpu(u->v); + break; -+ case FS_USAGE_INODES: ++ case BCH_FS_USAGE_inodes: + c->usage_base->nr_inodes = le64_to_cpu(u->v); + break; -+ case FS_USAGE_KEY_VERSION: ++ case BCH_FS_USAGE_key_version: + atomic64_set(&c->key_version, + le64_to_cpu(u->v)); + break; @@ -61824,10 +64809,7 @@ index 000000000000..c3b4d116275c + struct jset_entry_dev_usage *u = + container_of(entry, struct jset_entry_dev_usage, entry); + struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev)); -+ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); -+ unsigned nr_types = (bytes - sizeof(struct jset_entry_dev_usage)) / -+ sizeof(struct jset_entry_dev_usage_type); -+ unsigned i; ++ unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); + + ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec); + ca->usage_base->buckets_unavailable = le64_to_cpu(u->buckets_unavailable); @@ -61942,6 +64924,8 @@ index 
000000000000..c3b4d116275c +{ + unsigned i; + struct bch_sb_field_clean *clean = *cleanp; ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; + int ret = 0; + + if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, @@ -61954,7 +64938,6 @@ index 000000000000..c3b4d116275c + } + + for (i = 0; i < BTREE_ID_NR; i++) { -+ char buf1[200], buf2[200]; + struct bkey_i *k1, *k2; + unsigned l1 = 0, l2 = 0; + @@ -61964,6 +64947,19 @@ index 000000000000..c3b4d116275c + if (!k1 && !k2) + continue; + ++ printbuf_reset(&buf1); ++ printbuf_reset(&buf2); ++ ++ if (k1) ++ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(k1)); ++ else ++ pr_buf(&buf1, "(none)"); ++ ++ if (k2) ++ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(k2)); ++ else ++ pr_buf(&buf2, "(none)"); ++ + mustfix_fsck_err_on(!k1 || !k2 || + IS_ERR(k1) || + IS_ERR(k2) || @@ -61973,10 +64969,12 @@ index 000000000000..c3b4d116275c + "superblock btree root %u doesn't match journal after clean shutdown\n" + "sb: l=%u %s\n" + "journal: l=%u %s\n", i, -+ l1, (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(k1)), buf1), -+ l2, (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(k2)), buf2)); ++ l1, buf1.buf, ++ l2, buf2.buf); + } +fsck_err: ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf1); + return ret; +} + @@ -62003,7 +65001,7 @@ index 000000000000..c3b4d116275c + return ERR_PTR(-ENOMEM); + } + -+ ret = bch2_sb_clean_validate(c, clean, READ); ++ ret = bch2_sb_clean_validate_late(c, clean, READ); + if (ret) { + mutex_unlock(&c->sb_lock); + return ERR_PTR(ret); @@ -62099,7 +65097,6 @@ index 000000000000..c3b4d116275c + +static int bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) +{ -+ struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + struct bch_inode_unpacked inode; @@ -62113,7 +65110,7 @@ index 000000000000..c3b4d116275c + goto err; + + if (!bkey_is_inode(k.k)) { -+ bch_err(c, "root inode not found"); ++ bch_err(trans->c, "root inode not found"); + ret = -ENOENT; + 
goto err; + } @@ -62191,8 +65188,8 @@ index 000000000000..c3b4d116275c + bch_info(c, "filesystem version is prior to subvol_dirent - upgrading"); + c->opts.version_upgrade = true; + c->opts.fsck = true; -+ } else if (c->sb.version < bcachefs_metadata_version_inode_v2) { -+ bch_info(c, "filesystem version is prior to inode_v2 - upgrading"); ++ } else if (c->sb.version < bcachefs_metadata_version_alloc_v4) { ++ bch_info(c, "filesystem version is prior to alloc_v4 - upgrading"); + c->opts.version_upgrade = true; + } + } @@ -62206,6 +65203,7 @@ index 000000000000..c3b4d116275c + if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) { + struct journal_replay *i; + ++ bch_verbose(c, "starting journal read"); + ret = bch2_journal_read(c, &c->journal_entries, + &blacklist_seq, &journal_seq); + if (ret) @@ -62254,6 +65252,9 @@ index 000000000000..c3b4d116275c + blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1; + } + ++ if (c->opts.read_journal_only) ++ goto out; ++ + if (c->opts.reconstruct_alloc) { + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); + drop_alloc_keys(&c->journal_keys); @@ -62295,7 +65296,11 @@ index 000000000000..c3b4d116275c + + bch_verbose(c, "starting alloc read"); + err = "error reading allocation information"; ++ ++ down_read(&c->gc_lock); + ret = bch2_alloc_read(c); ++ up_read(&c->gc_lock); ++ + if (ret) + goto err; + bch_verbose(c, "alloc read done"); @@ -62307,7 +65312,12 @@ index 000000000000..c3b4d116275c + goto err; + bch_verbose(c, "stripes_read done"); + -+ set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); ++ /* ++ * If we're not running fsck, this ensures bch2_fsck_err() calls are ++ * instead interpreted as bch2_inconsistent_err() calls: ++ */ ++ if (!c->opts.fsck) ++ set_bit(BCH_FS_FSCK_DONE, &c->flags); + + if (c->opts.fsck || + !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)) || @@ -62315,18 +65325,32 @@ index 000000000000..c3b4d116275c + test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) { + bool metadata_only = 
c->opts.norecovery; + -+ bch_info(c, "starting mark and sweep"); -+ err = "error in mark and sweep"; ++ bch_info(c, "checking allocations"); ++ err = "error checking allocations"; + ret = bch2_gc(c, true, metadata_only); + if (ret) + goto err; -+ bch_verbose(c, "mark and sweep done"); ++ bch_verbose(c, "done checking allocations"); ++ } ++ ++ if (c->opts.fsck) { ++ bch_info(c, "checking need_discard and freespace btrees"); ++ err = "error checking need_discard and freespace btrees"; ++ ret = bch2_check_alloc_info(c, true); ++ if (ret) ++ goto err; ++ ++ ret = bch2_check_lrus(c, true); ++ if (ret) ++ goto err; ++ bch_verbose(c, "done checking need_discard and freespace btrees"); + } + + bch2_stripes_heap_start(c); + + clear_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); + set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); ++ set_bit(BCH_FS_MAY_GO_RW, &c->flags); + + /* + * Skip past versions that might have possibly been used (as nonces), @@ -62338,30 +65362,18 @@ index 000000000000..c3b4d116275c + if (c->opts.norecovery) + goto out; + -+ bch_verbose(c, "starting journal replay"); ++ bch_verbose(c, "starting journal replay, %zu keys", c->journal_keys.nr); + err = "journal replay failed"; -+ ret = bch2_journal_replay(c, c->journal_keys); ++ ret = bch2_journal_replay(c); + if (ret) + goto err; -+ bch_verbose(c, "journal replay done"); ++ if (c->opts.verbose || !c->sb.clean) ++ bch_info(c, "journal replay done"); + -+ if (test_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags) && -+ !c->opts.nochanges) { -+ /* -+ * note that even when filesystem was clean there might be work -+ * to do here, if we ran gc (because of fsck) which recalculated -+ * oldest_gen: -+ */ -+ bch_verbose(c, "writing allocation info"); -+ err = "error writing out alloc info"; -+ ret = bch2_stripes_write(c, BTREE_INSERT_LAZY_RW) ?: -+ bch2_alloc_write(c, BTREE_INSERT_LAZY_RW); -+ if (ret) { -+ bch_err(c, "error writing alloc info"); -+ goto err; -+ } -+ bch_verbose(c, "alloc write done"); -+ } ++ err = "error 
initializing freespace"; ++ ret = bch2_fs_freespace_init(c); ++ if (ret) ++ goto err; + + if (c->sb.version < bcachefs_metadata_version_snapshot_2) { + bch2_fs_lazy_rw(c); @@ -62412,23 +65424,6 @@ index 000000000000..c3b4d116275c + bch_verbose(c, "quotas done"); + } + -+ if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) || -+ !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) { -+ struct bch_move_stats stats; -+ -+ bch_move_stats_init(&stats, "recovery"); -+ -+ bch_info(c, "scanning for old btree nodes"); -+ ret = bch2_fs_read_write(c); -+ if (ret) -+ goto err; -+ -+ ret = bch2_scan_old_btree_nodes(c, &stats); -+ if (ret) -+ goto err; -+ bch_info(c, "scanning for old btree nodes done"); -+ } -+ + mutex_lock(&c->sb_lock); + if (c->opts.version_upgrade) { + c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); @@ -62453,6 +65448,24 @@ index 000000000000..c3b4d116275c + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + ++ if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) || ++ !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done)) || ++ le16_to_cpu(c->sb.version_min) < bcachefs_metadata_version_btree_ptr_sectors_written) { ++ struct bch_move_stats stats; ++ ++ bch_move_stats_init(&stats, "recovery"); ++ ++ bch_info(c, "scanning for old btree nodes"); ++ ret = bch2_fs_read_write(c); ++ if (ret) ++ goto err; ++ ++ ret = bch2_scan_old_btree_nodes(c, &stats); ++ if (ret) ++ goto err; ++ bch_info(c, "scanning for old btree nodes done"); ++ } ++ + if (c->journal_seq_blacklist_table && + c->journal_seq_blacklist_table->nr > 128) + queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work); @@ -62500,20 +65513,15 @@ index 000000000000..c3b4d116275c + c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); + bch2_write_super(c); + } -+ -+ for_each_online_member(ca, c, i) -+ bch2_mark_dev_superblock(c, ca, 0); + mutex_unlock(&c->sb_lock); + -+ set_bit(BCH_FS_ALLOC_READ_DONE, 
&c->flags); + set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); ++ set_bit(BCH_FS_MAY_GO_RW, &c->flags); ++ set_bit(BCH_FS_FSCK_DONE, &c->flags); + + for (i = 0; i < BTREE_ID_NR; i++) + bch2_btree_root_alloc(c, i); + -+ set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags); -+ set_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags); -+ + err = "unable to allocate journal buckets"; + for_each_online_member(ca, c, i) { + ret = bch2_dev_journal_alloc(ca); @@ -62539,6 +65547,7 @@ index 000000000000..c3b4d116275c + * Write out the superblock and journal buckets, now that we can do + * btree updates + */ ++ bch_verbose(c, "marking superblocks"); + err = "error marking superblock and journal"; + for_each_member_device(ca, c, i) { + ret = bch2_trans_mark_dev_sb(c, ca); @@ -62546,8 +65555,16 @@ index 000000000000..c3b4d116275c + percpu_ref_put(&ca->ref); + goto err; + } ++ ++ ca->new_fs_bucket_idx = 0; + } + ++ bch_verbose(c, "initializing freespace"); ++ err = "error initializing freespace"; ++ ret = bch2_fs_freespace_init(c); ++ if (ret) ++ goto err; ++ + err = "error creating root snapshot node"; + ret = bch2_fs_initialize_subvolumes(c); + if (ret) @@ -62614,10 +65631,10 @@ index 000000000000..c3b4d116275c +} diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h new file mode 100644 -index 000000000000..e45c70b3693f +index 000000000000..e6927a918df3 --- /dev/null +++ b/fs/bcachefs/recovery.h -@@ -0,0 +1,58 @@ +@@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_RECOVERY_H +#define _BCACHEFS_RECOVERY_H @@ -62651,24 +65668,32 @@ index 000000000000..e45c70b3693f + } last; +}; + ++size_t bch2_journal_key_search(struct journal_keys *, enum btree_id, ++ unsigned, struct bpos); ++struct bkey_i *bch2_journal_keys_peek(struct bch_fs *, enum btree_id, ++ unsigned, struct bpos pos); ++ ++int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id, ++ unsigned, struct bkey_i *); +int bch2_journal_key_insert(struct bch_fs *, enum btree_id, + unsigned, struct 
bkey_i *); +int bch2_journal_key_delete(struct bch_fs *, enum btree_id, + unsigned, struct bpos); ++void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, ++ unsigned, struct bpos); + +void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); +struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); +struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *); + +void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *); ++void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, ++ struct bch_fs *, struct btree *, ++ struct btree_node_iter, struct bpos); +void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, + struct bch_fs *, + struct btree *); + -+typedef int (*btree_walk_key_fn)(struct btree_trans *, struct bkey_s_c); -+ -+int bch2_btree_and_journal_walk(struct btree_trans *, enum btree_id, btree_walk_key_fn); -+ +void bch2_journal_keys_free(struct journal_keys *); +void bch2_journal_entries_free(struct list_head *); + @@ -62678,10 +65703,10 @@ index 000000000000..e45c70b3693f +#endif /* _BCACHEFS_RECOVERY_H */ diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c new file mode 100644 -index 000000000000..8dcac7815c9f +index 000000000000..6824730945d4 --- /dev/null +++ b/fs/bcachefs/reflink.c -@@ -0,0 +1,367 @@ +@@ -0,0 +1,404 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "bkey_buf.h" @@ -62782,6 +65807,24 @@ index 000000000000..8dcac7815c9f + return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r); +} + ++int bch2_trans_mark_reflink_v(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_i *new, ++ unsigned flags) ++{ ++ if (!(flags & BTREE_TRIGGER_OVERWRITE)) { ++ struct bkey_i_reflink_v *r = bkey_i_to_reflink_v(new); ++ ++ if (!r->v.refcount) { ++ r->k.type = KEY_TYPE_deleted; ++ r->k.size = 0; ++ set_bkey_val_u64s(&r->k, 0); ++ return 0; ++ } ++ } ++ ++ return 
bch2_trans_mark_extent(trans, old, new, flags); ++} ++ +/* indirect inline data */ + +const char *bch2_indirect_inline_data_invalid(const struct bch_fs *c, @@ -62803,6 +65846,24 @@ index 000000000000..8dcac7815c9f + min(datalen, 32U), d.v->data); +} + ++int bch2_trans_mark_indirect_inline_data(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_i *new, ++ unsigned flags) ++{ ++ if (!(flags & BTREE_TRIGGER_OVERWRITE)) { ++ struct bkey_i_indirect_inline_data *r = ++ bkey_i_to_indirect_inline_data(new); ++ ++ if (!r->v.refcount) { ++ r->k.type = KEY_TYPE_deleted; ++ r->k.size = 0; ++ set_bkey_val_u64s(&r->k, 0); ++ } ++ } ++ ++ return 0; ++} ++ +static int bch2_make_extent_indirect(struct btree_trans *trans, + struct btree_iter *extent_iter, + struct bkey_i *orig) @@ -62868,7 +65929,8 @@ index 000000000000..8dcac7815c9f + + r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); + -+ ret = bch2_trans_update(trans, extent_iter, &r_p->k_i, 0); ++ ret = bch2_trans_update(trans, extent_iter, &r_p->k_i, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); +err: + c->reflink_hint = reflink_iter.pos.offset; + bch2_trans_iter_exit(trans, &reflink_iter); @@ -63051,10 +66113,10 @@ index 000000000000..8dcac7815c9f +} diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h new file mode 100644 -index 000000000000..3745873fd88d +index 000000000000..8eb41c0292eb --- /dev/null +++ b/fs/bcachefs/reflink.h -@@ -0,0 +1,63 @@ +@@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_REFLINK_H +#define _BCACHEFS_REFLINK_H @@ -63067,27 +66129,37 @@ index 000000000000..3745873fd88d +#define bch2_bkey_ops_reflink_p (struct bkey_ops) { \ + .key_invalid = bch2_reflink_p_invalid, \ + .val_to_text = bch2_reflink_p_to_text, \ -+ .key_merge = bch2_reflink_p_merge, \ ++ .key_merge = bch2_reflink_p_merge, \ ++ .trans_trigger = bch2_trans_mark_reflink_p, \ ++ .atomic_trigger = bch2_mark_reflink_p, \ +} + +const char *bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c); 
+void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); ++int bch2_trans_mark_reflink_v(struct btree_trans *, struct bkey_s_c, ++ struct bkey_i *, unsigned); + +#define bch2_bkey_ops_reflink_v (struct bkey_ops) { \ + .key_invalid = bch2_reflink_v_invalid, \ + .val_to_text = bch2_reflink_v_to_text, \ + .swab = bch2_ptr_swab, \ ++ .trans_trigger = bch2_trans_mark_reflink_v, \ ++ .atomic_trigger = bch2_mark_extent, \ +} + +const char *bch2_indirect_inline_data_invalid(const struct bch_fs *, + struct bkey_s_c); +void bch2_indirect_inline_data_to_text(struct printbuf *, + struct bch_fs *, struct bkey_s_c); ++int bch2_trans_mark_indirect_inline_data(struct btree_trans *, ++ struct bkey_s_c, struct bkey_i *, ++ unsigned); + +#define bch2_bkey_ops_indirect_inline_data (struct bkey_ops) { \ + .key_invalid = bch2_indirect_inline_data_invalid, \ + .val_to_text = bch2_indirect_inline_data_to_text, \ ++ .trans_trigger = bch2_trans_mark_indirect_inline_data, \ +} + +static inline const __le64 *bkey_refcount_c(struct bkey_s_c k) @@ -63120,10 +66192,10 @@ index 000000000000..3745873fd88d +#endif /* _BCACHEFS_REFLINK_H */ diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c new file mode 100644 -index 000000000000..002006593044 +index 000000000000..c2771112d573 --- /dev/null +++ b/fs/bcachefs/replicas.c -@@ -0,0 +1,1094 @@ +@@ -0,0 +1,1073 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -63162,23 +66234,40 @@ index 000000000000..002006593044 + eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL); +} + ++void bch2_replicas_entry_v0_to_text(struct printbuf *out, ++ struct bch_replicas_entry_v0 *e) ++{ ++ unsigned i; ++ ++ if (e->data_type < BCH_DATA_NR) ++ pr_buf(out, "%s", bch2_data_types[e->data_type]); ++ else ++ pr_buf(out, "(invalid data type %u)", e->data_type); ++ ++ pr_buf(out, ": %u [", e->nr_devs); ++ for (i = 0; i < e->nr_devs; i++) ++ pr_buf(out, i ? 
" %u" : "%u", e->devs[i]); ++ pr_buf(out, "]"); ++} ++ +void bch2_replicas_entry_to_text(struct printbuf *out, + struct bch_replicas_entry *e) +{ + unsigned i; + -+ pr_buf(out, "%s: %u/%u [", -+ bch2_data_types[e->data_type], -+ e->nr_required, -+ e->nr_devs); ++ if (e->data_type < BCH_DATA_NR) ++ pr_buf(out, "%s", bch2_data_types[e->data_type]); ++ else ++ pr_buf(out, "(invalid data type %u)", e->data_type); + ++ pr_buf(out, ": %u/%u [", e->nr_required, e->nr_devs); + for (i = 0; i < e->nr_devs; i++) + pr_buf(out, i ? " %u" : "%u", e->devs[i]); + pr_buf(out, "]"); +} + +void bch2_cpu_replicas_to_text(struct printbuf *out, -+ struct bch_replicas_cpu *r) ++ struct bch_replicas_cpu *r) +{ + struct bch_replicas_entry *e; + bool first = true; @@ -63539,75 +66628,14 @@ index 000000000000..002006593044 + goto out; +} + -+static int __bch2_mark_replicas(struct bch_fs *c, -+ struct bch_replicas_entry *r, -+ bool check) -+{ -+ return likely(bch2_replicas_marked(c, r)) ? 0 -+ : check ? -1 -+ : bch2_mark_replicas_slowpath(c, r); -+} -+ +int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r) +{ -+ return __bch2_mark_replicas(c, r, false); -+} -+ -+static int __bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k, -+ bool check) -+{ -+ struct bch_replicas_padded search; -+ struct bch_devs_list cached = bch2_bkey_cached_devs(k); -+ unsigned i; -+ int ret; -+ -+ memset(&search, 0, sizeof(search)); -+ -+ for (i = 0; i < cached.nr; i++) { -+ bch2_replicas_entry_cached(&search.e, cached.devs[i]); -+ -+ ret = __bch2_mark_replicas(c, &search.e, check); -+ if (ret) -+ return ret; -+ } -+ -+ bch2_bkey_to_replicas(&search.e, k); -+ -+ ret = __bch2_mark_replicas(c, &search.e, check); -+ if (ret) -+ return ret; -+ -+ if (search.e.data_type == BCH_DATA_parity) { -+ search.e.data_type = BCH_DATA_cached; -+ ret = __bch2_mark_replicas(c, &search.e, check); -+ if (ret) -+ return ret; -+ -+ search.e.data_type = BCH_DATA_user; -+ ret = __bch2_mark_replicas(c, &search.e, 
check); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; ++ return likely(bch2_replicas_marked(c, r)) ++ ? 0 : bch2_mark_replicas_slowpath(c, r); +} + +/* replicas delta list: */ + -+bool bch2_replicas_delta_list_marked(struct bch_fs *c, -+ struct replicas_delta_list *r) -+{ -+ struct replicas_delta *d = r->d; -+ struct replicas_delta *top = (void *) r->d + r->used; -+ -+ percpu_rwsem_assert_held(&c->mark_lock); -+ -+ for (d = r->d; d != top; d = replicas_delta_next(d)) -+ if (bch2_replicas_entry_idx(c, &d->r) < 0) -+ return false; -+ return true; -+} -+ +int bch2_replicas_delta_list_mark(struct bch_fs *c, + struct replicas_delta_list *r) +{ @@ -63620,19 +66648,6 @@ index 000000000000..002006593044 + return ret; +} + -+/* bkey replicas: */ -+ -+bool bch2_bkey_replicas_marked(struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ return __bch2_mark_bkey_replicas(c, k, true) == 0; -+} -+ -+int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) -+{ -+ return __bch2_mark_bkey_replicas(c, k, false); -+} -+ +/* + * Old replicas_gc mechanism: only used for journal replicas entries now, should + * die at some point: @@ -64000,67 +67015,78 @@ index 000000000000..002006593044 + return 0; +} + -+static const char *check_dup_replicas_entries(struct bch_replicas_cpu *cpu_r) ++static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, ++ struct bch_sb *sb, ++ struct printbuf *err) +{ -+ unsigned i; ++ struct bch_sb_field_members *mi = bch2_sb_get_members(sb); ++ unsigned i, j; + + sort_cmp_size(cpu_r->entries, + cpu_r->nr, + cpu_r->entry_size, + memcmp, NULL); + -+ for (i = 0; i + 1 < cpu_r->nr; i++) { -+ struct bch_replicas_entry *l = ++ for (i = 0; i < cpu_r->nr; i++) { ++ struct bch_replicas_entry *e = + cpu_replicas_entry(cpu_r, i); -+ struct bch_replicas_entry *r = -+ cpu_replicas_entry(cpu_r, i + 1); + -+ BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0); ++ if (e->data_type >= BCH_DATA_NR) { ++ pr_buf(err, "invalid data type in entry "); ++ 
bch2_replicas_entry_to_text(err, e); ++ return -EINVAL; ++ } + -+ if (!memcmp(l, r, cpu_r->entry_size)) -+ return "duplicate replicas entry"; ++ if (!e->nr_devs) { ++ pr_buf(err, "no devices in entry "); ++ bch2_replicas_entry_to_text(err, e); ++ return -EINVAL; ++ } ++ ++ if (e->nr_required > 1 && ++ e->nr_required >= e->nr_devs) { ++ pr_buf(err, "bad nr_required in entry "); ++ bch2_replicas_entry_to_text(err, e); ++ return -EINVAL; ++ } ++ ++ for (j = 0; j < e->nr_devs; j++) ++ if (!bch2_dev_exists(sb, mi, e->devs[j])) { ++ pr_buf(err, "invalid device %u in entry ", e->devs[j]); ++ bch2_replicas_entry_to_text(err, e); ++ return -EINVAL; ++ } ++ ++ if (i + 1 < cpu_r->nr) { ++ struct bch_replicas_entry *n = ++ cpu_replicas_entry(cpu_r, i + 1); ++ ++ BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0); ++ ++ if (!memcmp(e, n, cpu_r->entry_size)) { ++ pr_buf(err, "duplicate replicas entry "); ++ bch2_replicas_entry_to_text(err, e); ++ return -EINVAL; ++ } ++ } + } + -+ return NULL; ++ return 0; +} + -+static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f) ++static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f, ++ struct printbuf *err) +{ + struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas); -+ struct bch_sb_field_members *mi = bch2_sb_get_members(sb); -+ struct bch_replicas_cpu cpu_r = { .entries = NULL }; -+ struct bch_replicas_entry *e; -+ const char *err; -+ unsigned i; ++ struct bch_replicas_cpu cpu_r; ++ int ret; + -+ for_each_replicas_entry(sb_r, e) { -+ err = "invalid replicas entry: invalid data type"; -+ if (e->data_type >= BCH_DATA_NR) -+ goto err; -+ -+ err = "invalid replicas entry: no devices"; -+ if (!e->nr_devs) -+ goto err; -+ -+ err = "invalid replicas entry: bad nr_required"; -+ if (e->nr_required > 1 && -+ e->nr_required >= e->nr_devs) -+ goto err; -+ -+ err = "invalid replicas entry: invalid device"; -+ for (i = 0; i < e->nr_devs; i++) -+ if (!bch2_dev_exists(sb, mi, e->devs[i])) -+ 
goto err; -+ } -+ -+ err = "cannot allocate memory"; + if (__bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r)) -+ goto err; ++ return -ENOMEM; + -+ err = check_dup_replicas_entries(&cpu_r); -+err: ++ ret = bch2_cpu_replicas_validate(&cpu_r, sb, err); + kfree(cpu_r.entries); -+ return err; ++ return ret; +} + +static void bch2_sb_replicas_to_text(struct printbuf *out, @@ -64078,49 +67104,50 @@ index 000000000000..002006593044 + + bch2_replicas_entry_to_text(out, e); + } ++ pr_newline(out); +} + +const struct bch_sb_field_ops bch_sb_field_ops_replicas = { -+ .validate = bch2_sb_validate_replicas, ++ .validate = bch2_sb_replicas_validate, + .to_text = bch2_sb_replicas_to_text, +}; + -+static const char *bch2_sb_validate_replicas_v0(struct bch_sb *sb, struct bch_sb_field *f) ++static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *f, ++ struct printbuf *err) +{ + struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); -+ struct bch_sb_field_members *mi = bch2_sb_get_members(sb); -+ struct bch_replicas_cpu cpu_r = { .entries = NULL }; -+ struct bch_replicas_entry_v0 *e; -+ const char *err; -+ unsigned i; ++ struct bch_replicas_cpu cpu_r; ++ int ret; + -+ for_each_replicas_entry_v0(sb_r, e) { -+ err = "invalid replicas entry: invalid data type"; -+ if (e->data_type >= BCH_DATA_NR) -+ goto err; -+ -+ err = "invalid replicas entry: no devices"; -+ if (!e->nr_devs) -+ goto err; -+ -+ err = "invalid replicas entry: invalid device"; -+ for (i = 0; i < e->nr_devs; i++) -+ if (!bch2_dev_exists(sb, mi, e->devs[i])) -+ goto err; -+ } -+ -+ err = "cannot allocate memory"; + if (__bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r)) -+ goto err; ++ return -ENOMEM; + -+ err = check_dup_replicas_entries(&cpu_r); -+err: ++ ret = bch2_cpu_replicas_validate(&cpu_r, sb, err); + kfree(cpu_r.entries); -+ return err; ++ return ret; ++} ++ ++static void bch2_sb_replicas_v0_to_text(struct printbuf *out, ++ struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ 
++ struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); ++ struct bch_replicas_entry_v0 *e; ++ bool first = true; ++ ++ for_each_replicas_entry(sb_r, e) { ++ if (!first) ++ pr_buf(out, " "); ++ first = false; ++ ++ bch2_replicas_entry_v0_to_text(out, e); ++ } ++ pr_newline(out); +} + +const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = { -+ .validate = bch2_sb_validate_replicas_v0, ++ .validate = bch2_sb_replicas_v0_validate, ++ .to_text = bch2_sb_replicas_v0_to_text, +}; + +/* Query replicas: */ @@ -64161,11 +67188,12 @@ index 000000000000..002006593044 + + if (dflags & ~flags) { + if (print) { -+ char buf[100]; ++ struct printbuf buf = PRINTBUF; + -+ bch2_replicas_entry_to_text(&PBUF(buf), e); ++ bch2_replicas_entry_to_text(&buf, e); + bch_err(c, "insufficient devices online (%u) for replicas entry %s", -+ nr_online, buf); ++ nr_online, buf.buf); ++ printbuf_exit(&buf); + } + ret = false; + break; @@ -64177,19 +67205,42 @@ index 000000000000..002006593044 + return ret; +} + ++unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev) ++{ ++ struct bch_sb_field_replicas *replicas; ++ struct bch_sb_field_replicas_v0 *replicas_v0; ++ unsigned i, data_has = 0; ++ ++ replicas = bch2_sb_get_replicas(sb); ++ replicas_v0 = bch2_sb_get_replicas_v0(sb); ++ ++ if (replicas) { ++ struct bch_replicas_entry *r; ++ ++ for_each_replicas_entry(replicas, r) ++ for (i = 0; i < r->nr_devs; i++) ++ if (r->devs[i] == dev) ++ data_has |= 1 << r->data_type; ++ } else if (replicas_v0) { ++ struct bch_replicas_entry_v0 *r; ++ ++ for_each_replicas_entry_v0(replicas_v0, r) ++ for (i = 0; i < r->nr_devs; i++) ++ if (r->devs[i] == dev) ++ data_has |= 1 << r->data_type; ++ } ++ ++ ++ return data_has; ++} ++ +unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) +{ -+ struct bch_replicas_entry *e; -+ unsigned i, ret = 0; ++ unsigned ret; + -+ percpu_down_read(&c->mark_lock); -+ -+ for_each_cpu_replicas_entry(&c->replicas, e) -+ for (i = 0; i < 
e->nr_devs; i++) -+ if (e->devs[i] == ca->dev_idx) -+ ret |= 1 << e->data_type; -+ -+ percpu_up_read(&c->mark_lock); ++ mutex_lock(&c->sb_lock); ++ ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx); ++ mutex_unlock(&c->sb_lock); + + return ret; +} @@ -64220,10 +67271,10 @@ index 000000000000..002006593044 +} diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h new file mode 100644 -index 000000000000..72ac544f16d8 +index 000000000000..87820b2e1ad3 --- /dev/null +++ b/fs/bcachefs/replicas.h -@@ -0,0 +1,108 @@ +@@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_REPLICAS_H +#define _BCACHEFS_REPLICAS_H @@ -64274,12 +67325,9 @@ index 000000000000..72ac544f16d8 + return (void *) d + replicas_entry_bytes(&d->r) + 8; +} + -+bool bch2_replicas_delta_list_marked(struct bch_fs *, struct replicas_delta_list *); +int bch2_replicas_delta_list_mark(struct bch_fs *, struct replicas_delta_list *); + +void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c); -+bool bch2_bkey_replicas_marked(struct bch_fs *, struct bkey_s_c); -+int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c); + +static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e, + unsigned dev) @@ -64293,6 +67341,7 @@ index 000000000000..72ac544f16d8 +bool bch2_have_enough_devs(struct bch_fs *, struct bch_devs_mask, + unsigned, bool); + ++unsigned bch2_sb_dev_has_data(struct bch_sb *, unsigned); +unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *); + +int bch2_replicas_gc_end(struct bch_fs *, int); @@ -64622,10 +67671,10 @@ index 000000000000..3dfaf34a43b2 +#endif /* _SIPHASH_H_ */ diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h new file mode 100644 -index 000000000000..57d636740d2f +index 000000000000..591bbb9f8beb --- /dev/null +++ b/fs/bcachefs/str_hash.h -@@ -0,0 +1,358 @@ +@@ -0,0 +1,351 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_STR_HASH_H +#define _BCACHEFS_STR_HASH_H @@ -64791,12 +67840,10 
@@ index 000000000000..57d636740d2f + if (ret) + return ret; + -+ for_each_btree_key_norestart(trans, *iter, desc.btree_id, ++ for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, + SPOS(inum.inum, desc.hash_key(info, key), snapshot), ++ POS(inum.inum, U64_MAX), + BTREE_ITER_SLOTS|flags, k, ret) { -+ if (iter->pos.inode != inum.inum) -+ break; -+ + if (is_visible_key(desc, inum, k)) { + if (!desc.cmp_key(k, key)) + return 0; @@ -64827,15 +67874,12 @@ index 000000000000..57d636740d2f + if (ret) + return ret; + -+ for_each_btree_key_norestart(trans, *iter, desc.btree_id, ++ for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, + SPOS(inum.inum, desc.hash_key(info, key), snapshot), -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { -+ if (iter->pos.inode != inum.inum) -+ break; -+ ++ POS(inum.inum, U64_MAX), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) + if (!is_visible_key(desc, inum, k)) + return 0; -+ } + bch2_trans_iter_exit(trans, iter); + + return ret ?: -ENOSPC; @@ -64888,14 +67932,12 @@ index 000000000000..57d636740d2f + if (ret) + return ret; + -+ for_each_btree_key_norestart(trans, iter, desc.btree_id, ++ for_each_btree_key_upto_norestart(trans, iter, desc.btree_id, + SPOS(inum.inum, + desc.hash_bkey(info, bkey_i_to_s_c(insert)), + snapshot), ++ POS(inum.inum, U64_MAX), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { -+ if (iter.pos.inode != inum.inum) -+ break; -+ + if (is_visible_key(desc, inum, k)) { + if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert))) + goto found; @@ -64986,10 +68028,10 @@ index 000000000000..57d636740d2f +#endif /* _BCACHEFS_STR_HASH_H */ diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c new file mode 100644 -index 000000000000..7e909a118189 +index 000000000000..cdb89ba216cc --- /dev/null +++ b/fs/bcachefs/subvolume.c -@@ -0,0 +1,1084 @@ +@@ -0,0 +1,1075 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -65131,7 +68173,7 @@ index 000000000000..7e909a118189 + for_each_btree_key(trans, iter, 
BTREE_ID_snapshots, + POS_MIN, 0, k, ret) { + u32 id = k.k->p.offset, child[2]; -+ unsigned nr_live = 0, live_idx; ++ unsigned nr_live = 0, live_idx = 0; + + if (k.k->type != KEY_TYPE_snapshot) + continue; @@ -65143,7 +68185,7 @@ index 000000000000..7e909a118189 + for (i = 0; i < 2; i++) { + ret = snapshot_live(trans, child[i]); + if (ret < 0) -+ break; ++ goto err; + + if (ret) + live_idx = i; @@ -65154,6 +68196,7 @@ index 000000000000..7e909a118189 + ? snapshot_t(c, child[live_idx])->equiv + : id; + } ++err: + bch2_trans_iter_exit(trans, &iter); + + if (ret) @@ -65448,10 +68491,10 @@ index 000000000000..7e909a118189 + return ret; +} + -+static int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, -+ u32 *new_snapids, -+ u32 *snapshot_subvols, -+ unsigned nr_snapids) ++int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, ++ u32 *new_snapids, ++ u32 *snapshot_subvols, ++ unsigned nr_snapids) +{ + struct btree_iter iter; + struct bkey_i_snapshot *n; @@ -65480,7 +68523,7 @@ index 000000000000..7e909a118189 + n = bch2_trans_kmalloc(trans, sizeof(*n)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) -+ return ret; ++ goto err; + + bkey_snapshot_init(&n->k_i); + n->k.p = iter.pos; @@ -65490,11 +68533,10 @@ index 000000000000..7e909a118189 + n->v.pad = 0; + SET_BCH_SNAPSHOT_SUBVOL(&n->v, true); + -+ bch2_trans_update(trans, &iter, &n->k_i, 0); -+ -+ ret = bch2_mark_snapshot(trans, bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0); ++ ret = bch2_trans_update(trans, &iter, &n->k_i, 0) ?: ++ bch2_mark_snapshot(trans, bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0); + if (ret) -+ break; ++ goto err; + + new_snapids[i] = iter.pos.offset; + } @@ -65515,7 +68557,7 @@ index 000000000000..7e909a118189 + n = bch2_trans_kmalloc(trans, sizeof(*n)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) -+ return ret; ++ goto err; + + bkey_reassemble(&n->k_i, k); + @@ -65528,43 +68570,30 @@ index 000000000000..7e909a118189 + n->v.children[0] = cpu_to_le32(new_snapids[0]); + n->v.children[1] 
= cpu_to_le32(new_snapids[1]); + SET_BCH_SNAPSHOT_SUBVOL(&n->v, false); -+ bch2_trans_update(trans, &iter, &n->k_i, 0); ++ ret = bch2_trans_update(trans, &iter, &n->k_i, 0); ++ if (ret) ++ goto err; + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + -+static int snapshot_id_add(struct snapshot_id_list *s, u32 id) ++static int snapshot_id_add(snapshot_id_list *s, u32 id) +{ + BUG_ON(snapshot_list_has_id(s, id)); + -+ if (s->nr == s->size) { -+ size_t new_size = max(8U, s->size * 2); -+ void *n = krealloc(s->d, -+ new_size * sizeof(s->d[0]), -+ GFP_KERNEL); -+ if (!n) { -+ pr_err("error allocating snapshot ID list"); -+ return -ENOMEM; -+ } -+ -+ s->d = n; -+ s->size = new_size; -+ }; -+ -+ s->d[s->nr++] = id; -+ return 0; ++ return darray_push(*s, id); +} + +static int bch2_snapshot_delete_keys_btree(struct btree_trans *trans, -+ struct snapshot_id_list *deleted, ++ snapshot_id_list *deleted, + enum btree_id btree_id) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; -+ struct snapshot_id_list equiv_seen = { 0 }; ++ snapshot_id_list equiv_seen = { 0 }; + struct bpos last_pos = POS_MIN; + int ret = 0; + @@ -65611,7 +68640,7 @@ index 000000000000..7e909a118189 + } + bch2_trans_iter_exit(trans, &iter); + -+ kfree(equiv_seen.d); ++ darray_exit(equiv_seen); + + return ret; +} @@ -65623,7 +68652,7 @@ index 000000000000..7e909a118189 + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_snapshot snap; -+ struct snapshot_id_list deleted = { 0 }; ++ snapshot_id_list deleted = { 0 }; + u32 i, id, children[2]; + int ret = 0; + @@ -65703,15 +68732,15 @@ index 000000000000..7e909a118189 + + for (i = 0; i < deleted.nr; i++) { + ret = __bch2_trans_do(&trans, NULL, NULL, 0, -+ bch2_snapshot_node_delete(&trans, deleted.d[i])); ++ bch2_snapshot_node_delete(&trans, deleted.data[i])); + if (ret) { + bch_err(c, "error deleting snapshot %u: %i", -+ deleted.d[i], ret); ++ deleted.data[i], ret); + goto err; + } + } +err: -+ 
kfree(deleted.d); ++ darray_exit(deleted); + bch2_trans_exit(&trans); + percpu_ref_put(&c->writes); +} @@ -65866,14 +68895,14 @@ index 000000000000..7e909a118189 +{ + struct bch_fs *c = container_of(work, struct bch_fs, + snapshot_wait_for_pagecache_and_delete_work); -+ struct snapshot_id_list s; ++ snapshot_id_list s; + u32 *id; + int ret = 0; + + while (!ret) { + mutex_lock(&c->snapshots_unlinked_lock); + s = c->snapshots_unlinked; -+ memset(&c->snapshots_unlinked, 0, sizeof(c->snapshots_unlinked)); ++ darray_init(c->snapshots_unlinked); + mutex_unlock(&c->snapshots_unlinked_lock); + + if (!s.nr) @@ -65881,7 +68910,7 @@ index 000000000000..7e909a118189 + + bch2_evict_subvolume_inodes(c, &s); + -+ for (id = s.d; id < s.d + s.nr; id++) { ++ for (id = s.data; id < s.data + s.nr; id++) { + ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_subvolume_delete(&trans, *id)); + if (ret) { @@ -65890,7 +68919,7 @@ index 000000000000..7e909a118189 + } + } + -+ kfree(s.d); ++ darray_exit(s); + } + + percpu_ref_put(&c->writes); @@ -66041,7 +69070,9 @@ index 000000000000..7e909a118189 + + if (src_subvolid) { + src_subvol->v.snapshot = cpu_to_le32(new_nodes[1]); -+ bch2_trans_update(trans, &src_iter, &src_subvol->k_i, 0); ++ ret = bch2_trans_update(trans, &src_iter, &src_subvol->k_i, 0); ++ if (ret) ++ goto err; + } + + new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol)); @@ -66056,7 +69087,9 @@ index 000000000000..7e909a118189 + SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro); + SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0); + new_subvol->k.p = dst_iter.pos; -+ bch2_trans_update(trans, &dst_iter, &new_subvol->k_i, 0); ++ ret = bch2_trans_update(trans, &dst_iter, &new_subvol->k_i, 0); ++ if (ret) ++ goto err; + + *new_subvolid = new_subvol->k.p.offset; + *new_snapshotid = new_nodes[0]; @@ -66076,14 +69109,15 @@ index 000000000000..7e909a118189 +} diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h new file mode 100644 -index 
000000000000..e4c3fdcdf22f +index 000000000000..f609291acafa --- /dev/null +++ b/fs/bcachefs/subvolume.h -@@ -0,0 +1,132 @@ +@@ -0,0 +1,124 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUBVOLUME_H +#define _BCACHEFS_SUBVOLUME_H + ++#include "darray.h" +#include "subvolume_types.h" + +void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); @@ -66140,15 +69174,13 @@ index 000000000000..e4c3fdcdf22f + +struct snapshots_seen { + struct bpos pos; -+ size_t nr; -+ size_t size; -+ u32 *d; ++ DARRAY(u32) ids; +}; + +static inline void snapshots_seen_exit(struct snapshots_seen *s) +{ -+ kfree(s->d); -+ s->d = NULL; ++ kfree(s->ids.data); ++ s->ids.data = NULL; +} + +static inline void snapshots_seen_init(struct snapshots_seen *s) @@ -66158,30 +69190,19 @@ index 000000000000..e4c3fdcdf22f + +static inline int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, u32 id) +{ -+ if (s->nr == s->size) { -+ size_t new_size = max(s->size, (size_t) 128) * 2; -+ u32 *d = krealloc(s->d, new_size * sizeof(s->d[0]), GFP_KERNEL); -+ -+ if (!d) { -+ bch_err(c, "error reallocating snapshots_seen table (new size %zu)", -+ new_size); -+ return -ENOMEM; -+ } -+ -+ s->size = new_size; -+ s->d = d; -+ } -+ -+ s->d[s->nr++] = id; -+ return 0; ++ int ret = darray_push(s->ids, id); ++ if (ret) ++ bch_err(c, "error reallocating snapshots_seen table (size %zu)", ++ s->ids.size); ++ return ret; +} + -+static inline bool snapshot_list_has_id(struct snapshot_id_list *s, u32 id) ++static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id) +{ -+ unsigned i; ++ u32 *i; + -+ for (i = 0; i < s->nr; i++) -+ if (id == s->d[i]) ++ darray_for_each(*s, i) ++ if (*i == id) + return true; + return false; +} @@ -66204,6 +69225,10 @@ index 000000000000..e4c3fdcdf22f + struct bch_subvolume *); +int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); + ++/* only exported for tests: */ ++int bch2_snapshot_node_create(struct btree_trans *, u32, ++ 
u32 *, u32 *, unsigned); ++ +int bch2_subvolume_delete(struct btree_trans *, u32); +int bch2_subvolume_unlink(struct btree_trans *, u32); +int bch2_subvolume_create(struct btree_trans *, u64, u32, @@ -66214,27 +69239,25 @@ index 000000000000..e4c3fdcdf22f +#endif /* _BCACHEFS_SUBVOLUME_H */ diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h new file mode 100644 -index 000000000000..9410b9587591 +index 000000000000..f7562b5d51df --- /dev/null +++ b/fs/bcachefs/subvolume_types.h -@@ -0,0 +1,11 @@ +@@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUBVOLUME_TYPES_H +#define _BCACHEFS_SUBVOLUME_TYPES_H + -+struct snapshot_id_list { -+ u32 nr; -+ u32 size; -+ u32 *d; -+}; ++#include "darray.h" ++ ++typedef DARRAY(u32) snapshot_id_list; + +#endif /* _BCACHEFS_SUBVOLUME_TYPES_H */ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c new file mode 100644 -index 000000000000..88a8e54fbd7a +index 000000000000..71abf87114df --- /dev/null +++ b/fs/bcachefs/super-io.c -@@ -0,0 +1,1202 @@ +@@ -0,0 +1,1601 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -66247,6 +69270,7 @@ index 000000000000..88a8e54fbd7a +#include "io.h" +#include "journal.h" +#include "journal_io.h" ++#include "journal_sb.h" +#include "journal_seq_blacklist.h" +#include "replicas.h" +#include "quota.h" @@ -66264,8 +69288,8 @@ index 000000000000..88a8e54fbd7a + NULL +}; + -+static const char *bch2_sb_field_validate(struct bch_sb *, -+ struct bch_sb_field *); ++static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *, ++ struct printbuf *); + +struct bch_sb_field *bch2_sb_field_get(struct bch_sb *sb, + enum bch_sb_field_type type) @@ -66439,22 +69463,31 @@ index 000000000000..88a8e54fbd7a + BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512); +} + -+static const char *validate_sb_layout(struct bch_sb_layout *layout) ++static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out) +{ + u64 offset, 
prev_offset, max_sectors; + unsigned i; + -+ if (uuid_le_cmp(layout->magic, BCACHE_MAGIC)) -+ return "Not a bcachefs superblock layout"; ++ if (uuid_le_cmp(layout->magic, BCACHE_MAGIC)) { ++ pr_buf(out, "Not a bcachefs superblock layout"); ++ return -EINVAL; ++ } + -+ if (layout->layout_type != 0) -+ return "Invalid superblock layout type"; ++ if (layout->layout_type != 0) { ++ pr_buf(out, "Invalid superblock layout type %u", ++ layout->layout_type); ++ return -EINVAL; ++ } + -+ if (!layout->nr_superblocks) -+ return "Invalid superblock layout: no superblocks"; ++ if (!layout->nr_superblocks) { ++ pr_buf(out, "Invalid superblock layout: no superblocks"); ++ return -EINVAL; ++ } + -+ if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) -+ return "Invalid superblock layout: too many superblocks"; ++ if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) { ++ pr_buf(out, "Invalid superblock layout: too many superblocks"); ++ return -EINVAL; ++ } + + max_sectors = 1 << layout->sb_max_size_bits; + @@ -66463,126 +69496,163 @@ index 000000000000..88a8e54fbd7a + for (i = 1; i < layout->nr_superblocks; i++) { + offset = le64_to_cpu(layout->sb_offset[i]); + -+ if (offset < prev_offset + max_sectors) -+ return "Invalid superblock layout: superblocks overlap"; ++ if (offset < prev_offset + max_sectors) { ++ pr_buf(out, "Invalid superblock layout: superblocks overlap\n" ++ " (sb %u ends at %llu next starts at %llu", ++ i - 1, prev_offset + max_sectors, offset); ++ return -EINVAL; ++ } + prev_offset = offset; + } + -+ return NULL; ++ return 0; +} + -+const char *bch2_sb_validate(struct bch_sb_handle *disk_sb) ++static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, ++ int rw) +{ + struct bch_sb *sb = disk_sb->sb; + struct bch_sb_field *f; + struct bch_sb_field_members *mi; -+ const char *err; ++ enum bch_opt_id opt_id; + u32 version, version_min; + u16 block_size; ++ int ret; + + version = le16_to_cpu(sb->version); -+ version_min = version 
>= bcachefs_metadata_version_new_versioning ++ version_min = version >= bcachefs_metadata_version_bkey_renumber + ? le16_to_cpu(sb->version_min) + : version; + -+ if (version >= bcachefs_metadata_version_max || -+ version_min < bcachefs_metadata_version_min) -+ return "Unsupported superblock version"; ++ if (version >= bcachefs_metadata_version_max) { ++ pr_buf(out, "Unsupported superblock version %u (min %u, max %u)", ++ version, bcachefs_metadata_version_min, bcachefs_metadata_version_max); ++ return -EINVAL; ++ } + -+ if (version_min > version) -+ return "Bad minimum version"; ++ if (version_min < bcachefs_metadata_version_min) { ++ pr_buf(out, "Unsupported superblock version %u (min %u, max %u)", ++ version_min, bcachefs_metadata_version_min, bcachefs_metadata_version_max); ++ return -EINVAL; ++ } ++ ++ if (version_min > version) { ++ pr_buf(out, "Bad minimum version %u, greater than version field %u", ++ version_min, version); ++ return -EINVAL; ++ } + + if (sb->features[1] || -+ (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) -+ return "Filesystem has incompatible features"; ++ (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) { ++ pr_buf(out, "Filesystem has incompatible features"); ++ return -EINVAL; ++ } + + block_size = le16_to_cpu(sb->block_size); + -+ if (!is_power_of_2(block_size) || -+ block_size > PAGE_SECTORS) -+ return "Bad block size"; ++ if (block_size > PAGE_SECTORS) { ++ pr_buf(out, "Block size too big (got %u, max %u)", ++ block_size, PAGE_SECTORS); ++ return -EINVAL; ++ } + -+ if (bch2_is_zero(sb->user_uuid.b, sizeof(uuid_le))) -+ return "Bad user UUID"; ++ if (bch2_is_zero(sb->user_uuid.b, sizeof(uuid_le))) { ++ pr_buf(out, "Bad user UUID (got zeroes)"); ++ return -EINVAL; ++ } + -+ if (bch2_is_zero(sb->uuid.b, sizeof(uuid_le))) -+ return "Bad internal UUID"; ++ if (bch2_is_zero(sb->uuid.b, sizeof(uuid_le))) { ++ pr_buf(out, "Bad intenal UUID (got zeroes)"); ++ return -EINVAL; ++ } + + if (!sb->nr_devices || -+ 
sb->nr_devices <= sb->dev_idx || -+ sb->nr_devices > BCH_SB_MEMBERS_MAX) -+ return "Bad number of member devices"; ++ sb->nr_devices > BCH_SB_MEMBERS_MAX) { ++ pr_buf(out, "Bad number of member devices %u (max %u)", ++ sb->nr_devices, BCH_SB_MEMBERS_MAX); ++ return -EINVAL; ++ } + -+ if (!BCH_SB_META_REPLICAS_WANT(sb) || -+ BCH_SB_META_REPLICAS_WANT(sb) > BCH_REPLICAS_MAX) -+ return "Invalid number of metadata replicas"; -+ -+ if (!BCH_SB_META_REPLICAS_REQ(sb) || -+ BCH_SB_META_REPLICAS_REQ(sb) > BCH_REPLICAS_MAX) -+ return "Invalid number of metadata replicas"; -+ -+ if (!BCH_SB_DATA_REPLICAS_WANT(sb) || -+ BCH_SB_DATA_REPLICAS_WANT(sb) > BCH_REPLICAS_MAX) -+ return "Invalid number of data replicas"; -+ -+ if (!BCH_SB_DATA_REPLICAS_REQ(sb) || -+ BCH_SB_DATA_REPLICAS_REQ(sb) > BCH_REPLICAS_MAX) -+ return "Invalid number of data replicas"; -+ -+ if (BCH_SB_META_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR) -+ return "Invalid metadata checksum type"; -+ -+ if (BCH_SB_DATA_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR) -+ return "Invalid metadata checksum type"; -+ -+ if (BCH_SB_COMPRESSION_TYPE(sb) >= BCH_COMPRESSION_OPT_NR) -+ return "Invalid compression type"; -+ -+ if (!BCH_SB_BTREE_NODE_SIZE(sb)) -+ return "Btree node size not set"; -+ -+ if (!is_power_of_2(BCH_SB_BTREE_NODE_SIZE(sb))) -+ return "Btree node size not a power of two"; -+ -+ if (BCH_SB_GC_RESERVE(sb) < 5) -+ return "gc reserve percentage too small"; ++ if (sb->dev_idx >= sb->nr_devices) { ++ pr_buf(out, "Bad dev_idx (got %u, nr_devices %u)", ++ sb->dev_idx, sb->nr_devices); ++ return -EINVAL; ++ } + + if (!sb->time_precision || -+ le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) -+ return "invalid time precision"; ++ le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) { ++ pr_buf(out, "Invalid time precision: %u (min 1, max %lu)", ++ le32_to_cpu(sb->time_precision), NSEC_PER_SEC); ++ return -EINVAL; ++ } ++ ++ if (rw == READ) { ++ /* ++ * Been seeing a bug where these are getting inexplicably ++ * zeroed, so we'r now validating 
them, but we have to be ++ * careful not to preven people's filesystems from mounting: ++ */ ++ if (!BCH_SB_JOURNAL_FLUSH_DELAY(sb)) ++ SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000); ++ if (!BCH_SB_JOURNAL_RECLAIM_DELAY(sb)) ++ SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 1000); ++ } ++ ++ for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) { ++ const struct bch_option *opt = bch2_opt_table + opt_id; ++ ++ if (opt->get_sb != BCH2_NO_SB_OPT) { ++ u64 v = bch2_opt_from_sb(sb, opt_id); ++ ++ pr_buf(out, "Invalid option "); ++ ret = bch2_opt_validate(opt, v, out); ++ if (ret) ++ return ret; ++ ++ printbuf_reset(out); ++ } ++ } + + /* validate layout */ -+ err = validate_sb_layout(&sb->layout); -+ if (err) -+ return err; ++ ret = validate_sb_layout(&sb->layout, out); ++ if (ret) ++ return ret; + + vstruct_for_each(sb, f) { -+ if (!f->u64s) -+ return "Invalid superblock: invalid optional field"; ++ if (!f->u64s) { ++ pr_buf(out, "Invalid superblock: optional with size 0 (type %u)", ++ le32_to_cpu(f->type)); ++ return -EINVAL; ++ } + -+ if (vstruct_next(f) > vstruct_last(sb)) -+ return "Invalid superblock: invalid optional field"; ++ if (vstruct_next(f) > vstruct_last(sb)) { ++ pr_buf(out, "Invalid superblock: optional field extends past end of superblock (type %u)", ++ le32_to_cpu(f->type)); ++ return -EINVAL; ++ } + } + + /* members must be validated first: */ + mi = bch2_sb_get_members(sb); -+ if (!mi) -+ return "Invalid superblock: member info area missing"; ++ if (!mi) { ++ pr_buf(out, "Invalid superblock: member info area missing"); ++ return -EINVAL; ++ } + -+ err = bch2_sb_field_validate(sb, &mi->field); -+ if (err) -+ return err; ++ ret = bch2_sb_field_validate(sb, &mi->field, out); ++ if (ret) ++ return ret; + + vstruct_for_each(sb, f) { + if (le32_to_cpu(f->type) == BCH_SB_FIELD_members) + continue; + -+ err = bch2_sb_field_validate(sb, f); -+ if (err) -+ return err; ++ ret = bch2_sb_field_validate(sb, f, out); ++ if (ret) ++ return ret; + } + -+ return NULL; ++ return 0; +} 
+ +/* device open: */ @@ -66603,7 +69673,6 @@ index 000000000000..88a8e54fbd7a + c->sb.nr_devices = src->nr_devices; + c->sb.clean = BCH_SB_CLEAN(src); + c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src); -+ c->sb.encoded_extent_max= 1 << BCH_SB_ENCODED_EXTENT_MAX_BITS(src); + + c->sb.nsec_per_time_unit = le32_to_cpu(src->time_precision); + c->sb.time_units_per_sec = NSEC_PER_SEC / c->sb.nsec_per_time_unit; @@ -66645,7 +69714,7 @@ index 000000000000..88a8e54fbd7a + memcpy(dst->compat, src->compat, sizeof(dst->compat)); + + for (i = 0; i < BCH_SB_FIELD_NR; i++) { -+ if (i == BCH_SB_FIELD_journal) ++ if ((1U << i) & BCH_SINGLE_DEVICE_SB_FIELDS) + continue; + + src_f = bch2_sb_field_get(src, i); @@ -66676,9 +69745,6 @@ index 000000000000..88a8e54fbd7a + + __copy_super(&c->disk_sb, src); + -+ if (BCH_SB_INITIALIZED(c->disk_sb.sb)) -+ set_bit(BCH_FS_INITIALIZED, &c->flags); -+ + ret = bch2_sb_replicas_to_cpu_replicas(c); + if (ret) + return ret; @@ -66712,10 +69778,12 @@ index 000000000000..88a8e54fbd7a + +/* read superblock: */ + -+static const char *read_one_super(struct bch_sb_handle *sb, u64 offset) ++static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf *err) +{ + struct bch_csum csum; ++ u32 version, version_min; + size_t bytes; ++ int ret; +reread: + bio_reset(sb->bio); + bio_set_dev(sb->bio, sb->bdev); @@ -66723,40 +69791,65 @@ index 000000000000..88a8e54fbd7a + bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META); + bch2_bio_map(sb->bio, sb->sb, sb->buffer_size); + -+ if (submit_bio_wait(sb->bio)) -+ return "IO error"; ++ ret = submit_bio_wait(sb->bio); ++ if (ret) { ++ pr_buf(err, "IO error: %i", ret); ++ return ret; ++ } + -+ if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC)) -+ return "Not a bcachefs superblock"; ++ if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC)) { ++ pr_buf(err, "Not a bcachefs superblock"); ++ return -EINVAL; ++ } + -+ if (le16_to_cpu(sb->sb->version) < bcachefs_metadata_version_min || -+ 
le16_to_cpu(sb->sb->version) >= bcachefs_metadata_version_max) -+ return "Unsupported superblock version"; ++ version = le16_to_cpu(sb->sb->version); ++ version_min = version >= bcachefs_metadata_version_bkey_renumber ++ ? le16_to_cpu(sb->sb->version_min) ++ : version; ++ ++ if (version >= bcachefs_metadata_version_max) { ++ pr_buf(err, "Unsupported superblock version %u (min %u, max %u)", ++ version, bcachefs_metadata_version_min, bcachefs_metadata_version_max); ++ return -EINVAL; ++ } ++ ++ if (version_min < bcachefs_metadata_version_min) { ++ pr_buf(err, "Unsupported superblock version %u (min %u, max %u)", ++ version_min, bcachefs_metadata_version_min, bcachefs_metadata_version_max); ++ return -EINVAL; ++ } + + bytes = vstruct_bytes(sb->sb); + -+ if (bytes > 512 << sb->sb->layout.sb_max_size_bits) -+ return "Bad superblock: too big"; ++ if (bytes > 512 << sb->sb->layout.sb_max_size_bits) { ++ pr_buf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)", ++ bytes, 512UL << sb->sb->layout.sb_max_size_bits); ++ return -EINVAL; ++ } + + if (bytes > sb->buffer_size) { + if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s))) -+ return "cannot allocate memory"; ++ return -ENOMEM; + goto reread; + } + -+ if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) -+ return "unknown csum type"; ++ if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) { ++ pr_buf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb)); ++ return -EINVAL; ++ } + + /* XXX: verify MACs */ + csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb), + null_nonce(), sb->sb); + -+ if (bch2_crc_cmp(csum, sb->sb->csum)) -+ return "bad checksum reading superblock"; ++ if (bch2_crc_cmp(csum, sb->sb->csum)) { ++ pr_buf(err, "bad checksum"); ++ return -EINVAL; ++ } + + sb->seq = le64_to_cpu(sb->sb->seq); + -+ return NULL; ++ return 0; +} + +int bch2_read_super(const char *path, struct bch_opts *opts, @@ -66764,7 +69857,7 @@ index 000000000000..88a8e54fbd7a +{ + u64 offset = opt_get(*opts, sb); + struct 
bch_sb_layout layout; -+ const char *err; ++ struct printbuf err = PRINTBUF; + __le64 *i; + int ret; + @@ -66796,25 +69889,28 @@ index 000000000000..88a8e54fbd7a + goto out; + } + -+ err = "cannot allocate memory"; + ret = bch2_sb_realloc(sb, 0); -+ if (ret) ++ if (ret) { ++ pr_buf(&err, "error allocating memory for superblock"); + goto err; ++ } + -+ ret = -EFAULT; -+ err = "dynamic fault"; -+ if (bch2_fs_init_fault("read_super")) ++ if (bch2_fs_init_fault("read_super")) { ++ pr_buf(&err, "dynamic fault"); ++ ret = -EFAULT; + goto err; ++ } + -+ ret = -EINVAL; -+ err = read_one_super(sb, offset); -+ if (!err) ++ ret = read_one_super(sb, offset, &err); ++ if (!ret) + goto got_super; + + if (opt_defined(*opts, sb)) + goto err; + -+ pr_err("error reading default superblock: %s", err); ++ printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s", ++ path, err.buf); ++ printbuf_reset(&err); + + /* + * Error reading primary superblock - read location of backup @@ -66830,13 +69926,15 @@ index 000000000000..88a8e54fbd7a + */ + bch2_bio_map(sb->bio, sb->sb, sizeof(struct bch_sb_layout)); + -+ err = "IO error"; -+ if (submit_bio_wait(sb->bio)) ++ ret = submit_bio_wait(sb->bio); ++ if (ret) { ++ pr_buf(&err, "IO error: %i", ret); + goto err; ++ } + + memcpy(&layout, sb->sb, sizeof(layout)); -+ err = validate_sb_layout(&layout); -+ if (err) ++ ret = validate_sb_layout(&layout, &err); ++ if (ret) + goto err; + + for (i = layout.sb_offset; @@ -66846,29 +69944,41 @@ index 000000000000..88a8e54fbd7a + if (offset == opt_get(*opts, sb)) + continue; + -+ err = read_one_super(sb, offset); -+ if (!err) ++ ret = read_one_super(sb, offset, &err); ++ if (!ret) + goto got_super; + } + -+ ret = -EINVAL; + goto err; + +got_super: -+ err = "Superblock block size smaller than device block size"; -+ ret = -EINVAL; + if (le16_to_cpu(sb->sb->block_size) << 9 < -+ bdev_logical_block_size(sb->bdev)) ++ bdev_logical_block_size(sb->bdev)) { ++ pr_buf(&err, "block size (%u) smaller than 
device block size (%u)", ++ le16_to_cpu(sb->sb->block_size) << 9, ++ bdev_logical_block_size(sb->bdev)); ++ ret = -EINVAL; + goto err; ++ } + + ret = 0; + sb->have_layout = true; ++ ++ ret = bch2_sb_validate(sb, &err, READ); ++ if (ret) { ++ printk(KERN_ERR "bcachefs (%s): error validating superblock: %s", ++ path, err.buf); ++ goto err_no_print; ++ } +out: + pr_verbose_init(*opts, "ret %i", ret); ++ printbuf_exit(&err); + return ret; +err: ++ printk(KERN_ERR "bcachefs (%s): error reading superblock: %s", ++ path, err.buf); ++err_no_print: + bch2_free_super(sb); -+ pr_err("error reading superblock: %s", err); + goto out; +} + @@ -66940,8 +70050,8 @@ index 000000000000..88a8e54fbd7a +{ + struct closure *cl = &c->sb_write; + struct bch_dev *ca; ++ struct printbuf err = PRINTBUF; + unsigned i, sb = 0, nr_wrote; -+ const char *err; + struct bch_devs_mask sb_written; + bool wrote, can_mount_without_written, can_mount_with_written; + unsigned degraded_flags = BCH_FORCE_IF_DEGRADED; @@ -66968,10 +70078,12 @@ index 000000000000..88a8e54fbd7a + bch2_sb_from_fs(c, ca); + + for_each_online_member(ca, c, i) { -+ err = bch2_sb_validate(&ca->disk_sb); -+ if (err) { -+ bch2_fs_inconsistent(c, "sb invalid before write: %s", err); -+ ret = -1; ++ printbuf_reset(&err); ++ ++ ret = bch2_sb_validate(&ca->disk_sb, &err, WRITE); ++ if (ret) { ++ bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf); ++ percpu_ref_put(&ca->io_ref); + goto out; + } + } @@ -66989,11 +70101,24 @@ index 000000000000..88a8e54fbd7a + closure_sync(cl); + + for_each_online_member(ca, c, i) { -+ if (!ca->sb_write_error && -+ ca->disk_sb.seq != -+ le64_to_cpu(ca->sb_read_scratch->seq)) { ++ if (ca->sb_write_error) ++ continue; ++ ++ if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) { + bch2_fs_fatal_error(c, -+ "Superblock modified by another process"); ++ "Superblock write was silently dropped! 
(seq %llu expected %llu)", ++ le64_to_cpu(ca->sb_read_scratch->seq), ++ ca->disk_sb.seq); ++ percpu_ref_put(&ca->io_ref); ++ ret = -EROFS; ++ goto out; ++ } ++ ++ if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) { ++ bch2_fs_fatal_error(c, ++ "Superblock modified by another process (seq %llu expected %llu)", ++ le64_to_cpu(ca->sb_read_scratch->seq), ++ ca->disk_sb.seq); + percpu_ref_put(&ca->io_ref); + ret = -EROFS; + goto out; @@ -67048,6 +70173,7 @@ index 000000000000..88a8e54fbd7a +out: + /* Make new options visible after they're persistent: */ + bch2_sb_update(c); ++ printbuf_exit(&err); + return ret; +} + @@ -67062,133 +70188,218 @@ index 000000000000..88a8e54fbd7a + mutex_unlock(&c->sb_lock); +} + -+/* BCH_SB_FIELD_journal: */ -+ -+static int u64_cmp(const void *_l, const void *_r) -+{ -+ u64 l = *((const u64 *) _l), r = *((const u64 *) _r); -+ -+ return l < r ? -1 : l > r ? 1 : 0; -+} -+ -+static const char *bch2_sb_validate_journal(struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_journal *journal = field_to_type(f, journal); -+ struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; -+ const char *err; -+ unsigned nr; -+ unsigned i; -+ u64 *b; -+ -+ journal = bch2_sb_get_journal(sb); -+ if (!journal) -+ return NULL; -+ -+ nr = bch2_nr_journal_buckets(journal); -+ if (!nr) -+ return NULL; -+ -+ b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL); -+ if (!b) -+ return "cannot allocate memory"; -+ -+ for (i = 0; i < nr; i++) -+ b[i] = le64_to_cpu(journal->buckets[i]); -+ -+ sort(b, nr, sizeof(u64), u64_cmp, NULL); -+ -+ err = "journal bucket at sector 0"; -+ if (!b[0]) -+ goto err; -+ -+ err = "journal bucket before first bucket"; -+ if (m && b[0] < le16_to_cpu(m->first_bucket)) -+ goto err; -+ -+ err = "journal bucket past end of device"; -+ if (m && b[nr - 1] >= le64_to_cpu(m->nbuckets)) -+ goto err; -+ -+ err = "duplicate journal buckets"; -+ for (i = 0; i + 1 < nr; i++) -+ if (b[i] == b[i + 1]) -+ goto err; -+ 
-+ err = NULL; -+err: -+ kfree(b); -+ return err; -+} -+ -+static const struct bch_sb_field_ops bch_sb_field_ops_journal = { -+ .validate = bch2_sb_validate_journal, -+}; -+ +/* BCH_SB_FIELD_members: */ + -+static const char *bch2_sb_validate_members(struct bch_sb *sb, -+ struct bch_sb_field *f) ++static int bch2_sb_members_validate(struct bch_sb *sb, ++ struct bch_sb_field *f, ++ struct printbuf *err) +{ + struct bch_sb_field_members *mi = field_to_type(f, members); -+ struct bch_member *m; ++ unsigned i; + + if ((void *) (mi->members + sb->nr_devices) > -+ vstruct_end(&mi->field)) -+ return "Invalid superblock: bad member info"; ++ vstruct_end(&mi->field)) { ++ pr_buf(err, "too many devices for section size"); ++ return -EINVAL; ++ } ++ ++ for (i = 0; i < sb->nr_devices; i++) { ++ struct bch_member *m = mi->members + i; + -+ for (m = mi->members; -+ m < mi->members + sb->nr_devices; -+ m++) { + if (!bch2_member_exists(m)) + continue; + -+ if (le64_to_cpu(m->nbuckets) > LONG_MAX) -+ return "Too many buckets"; ++ if (le64_to_cpu(m->nbuckets) > LONG_MAX) { ++ pr_buf(err, "device %u: too many buckets (got %llu, max %lu)", ++ i, le64_to_cpu(m->nbuckets), LONG_MAX); ++ return -EINVAL; ++ } + + if (le64_to_cpu(m->nbuckets) - -+ le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) -+ return "Not enough buckets"; ++ le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) { ++ pr_buf(err, "device %u: not enough buckets (got %llu, max %u)", ++ i, le64_to_cpu(m->nbuckets), BCH_MIN_NR_NBUCKETS); ++ return -EINVAL; ++ } + + if (le16_to_cpu(m->bucket_size) < -+ le16_to_cpu(sb->block_size)) -+ return "bucket size smaller than block size"; ++ le16_to_cpu(sb->block_size)) { ++ pr_buf(err, "device %u: bucket size %u smaller than block size %u", ++ i, le16_to_cpu(m->bucket_size), le16_to_cpu(sb->block_size)); ++ return -EINVAL; ++ } + + if (le16_to_cpu(m->bucket_size) < -+ BCH_SB_BTREE_NODE_SIZE(sb)) -+ return "bucket size smaller than btree node size"; ++ BCH_SB_BTREE_NODE_SIZE(sb)) { 
++ pr_buf(err, "device %u: bucket size %u smaller than btree node size %llu", ++ i, le16_to_cpu(m->bucket_size), BCH_SB_BTREE_NODE_SIZE(sb)); ++ return -EINVAL; ++ } + } + -+ return NULL; ++ return 0; ++} ++ ++static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_members *mi = field_to_type(f, members); ++ struct bch_sb_field_disk_groups *gi = bch2_sb_get_disk_groups(sb); ++ unsigned i; ++ ++ for (i = 0; i < sb->nr_devices; i++) { ++ struct bch_member *m = mi->members + i; ++ unsigned data_have = bch2_sb_dev_has_data(sb, i); ++ u64 bucket_size = le16_to_cpu(m->bucket_size); ++ u64 device_size = le64_to_cpu(m->nbuckets) * bucket_size; ++ ++ if (!bch2_member_exists(m)) ++ continue; ++ ++ pr_buf(out, "Device:"); ++ pr_tab(out); ++ pr_buf(out, "%u", i); ++ pr_newline(out); ++ ++ pr_indent_push(out, 2); ++ ++ pr_buf(out, "UUID:"); ++ pr_tab(out); ++ pr_uuid(out, m->uuid.b); ++ pr_newline(out); ++ ++ pr_buf(out, "Size:"); ++ pr_tab(out); ++ pr_units(out, device_size, device_size << 9); ++ pr_newline(out); ++ ++ pr_buf(out, "Bucket size:"); ++ pr_tab(out); ++ pr_units(out, bucket_size, bucket_size << 9); ++ pr_newline(out); ++ ++ pr_buf(out, "First bucket:"); ++ pr_tab(out); ++ pr_buf(out, "%u", le16_to_cpu(m->first_bucket)); ++ pr_newline(out); ++ ++ pr_buf(out, "Buckets:"); ++ pr_tab(out); ++ pr_buf(out, "%llu", le64_to_cpu(m->nbuckets)); ++ pr_newline(out); ++ ++ pr_buf(out, "Last mount:"); ++ pr_tab(out); ++ if (m->last_mount) ++ pr_time(out, le64_to_cpu(m->last_mount)); ++ else ++ pr_buf(out, "(never)"); ++ pr_newline(out); ++ ++ pr_buf(out, "State:"); ++ pr_tab(out); ++ pr_buf(out, "%s", ++ BCH_MEMBER_STATE(m) < BCH_MEMBER_STATE_NR ++ ? 
bch2_member_states[BCH_MEMBER_STATE(m)] ++ : "unknown"); ++ pr_newline(out); ++ ++ pr_buf(out, "Group:"); ++ pr_tab(out); ++ if (BCH_MEMBER_GROUP(m)) { ++ unsigned idx = BCH_MEMBER_GROUP(m) - 1; ++ ++ if (idx < disk_groups_nr(gi)) ++ pr_buf(out, "%s (%u)", ++ gi->entries[idx].label, idx); ++ else ++ pr_buf(out, "(bad disk labels section)"); ++ } else { ++ pr_buf(out, "(none)"); ++ } ++ pr_newline(out); ++ ++ pr_buf(out, "Data allowed:"); ++ pr_tab(out); ++ if (BCH_MEMBER_DATA_ALLOWED(m)) ++ bch2_flags_to_text(out, bch2_data_types, ++ BCH_MEMBER_DATA_ALLOWED(m)); ++ else ++ pr_buf(out, "(none)"); ++ pr_newline(out); ++ ++ pr_buf(out, "Has data:"); ++ pr_tab(out); ++ if (data_have) ++ bch2_flags_to_text(out, bch2_data_types, data_have); ++ else ++ pr_buf(out, "(none)"); ++ pr_newline(out); ++ ++ pr_buf(out, "Discard:"); ++ pr_tab(out); ++ pr_buf(out, "%llu", BCH_MEMBER_DISCARD(m)); ++ pr_newline(out); ++ ++ pr_buf(out, "Freespace initialized:"); ++ pr_tab(out); ++ pr_buf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(m)); ++ pr_newline(out); ++ ++ pr_indent_pop(out, 2); ++ } +} + +static const struct bch_sb_field_ops bch_sb_field_ops_members = { -+ .validate = bch2_sb_validate_members, ++ .validate = bch2_sb_members_validate, ++ .to_text = bch2_sb_members_to_text, +}; + +/* BCH_SB_FIELD_crypt: */ + -+static const char *bch2_sb_validate_crypt(struct bch_sb *sb, -+ struct bch_sb_field *f) ++static int bch2_sb_crypt_validate(struct bch_sb *sb, ++ struct bch_sb_field *f, ++ struct printbuf *err) +{ + struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); + -+ if (vstruct_bytes(&crypt->field) != sizeof(*crypt)) -+ return "invalid field crypt: wrong size"; ++ if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) { ++ pr_buf(err, "wrong size (got %zu should be %zu)", ++ vstruct_bytes(&crypt->field), sizeof(*crypt)); ++ return -EINVAL; ++ } + -+ if (BCH_CRYPT_KDF_TYPE(crypt)) -+ return "invalid field crypt: bad kdf type"; ++ if (BCH_CRYPT_KDF_TYPE(crypt)) { ++ pr_buf(err, 
"bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt)); ++ return -EINVAL; ++ } + -+ return NULL; ++ return 0; ++} ++ ++static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); ++ ++ pr_buf(out, "KFD: %llu", BCH_CRYPT_KDF_TYPE(crypt)); ++ pr_newline(out); ++ pr_buf(out, "scrypt n: %llu", BCH_KDF_SCRYPT_N(crypt)); ++ pr_newline(out); ++ pr_buf(out, "scrypt r: %llu", BCH_KDF_SCRYPT_R(crypt)); ++ pr_newline(out); ++ pr_buf(out, "scrypt p: %llu", BCH_KDF_SCRYPT_P(crypt)); ++ pr_newline(out); +} + +static const struct bch_sb_field_ops bch_sb_field_ops_crypt = { -+ .validate = bch2_sb_validate_crypt, ++ .validate = bch2_sb_crypt_validate, ++ .to_text = bch2_sb_crypt_to_text, +}; + +/* BCH_SB_FIELD_clean: */ + -+int bch2_sb_clean_validate(struct bch_fs *c, struct bch_sb_field_clean *clean, int write) ++int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean, int write) +{ + struct jset_entry *entry; + int ret; @@ -67264,7 +70475,7 @@ index 000000000000..88a8e54fbd7a + struct jset_entry_usage, entry); + + u->entry.type = BCH_JSET_ENTRY_usage; -+ u->entry.btree_id = FS_USAGE_INODES; ++ u->entry.btree_id = BCH_FS_USAGE_inodes; + u->v = cpu_to_le64(c->usage_base->nr_inodes); + } + @@ -67274,7 +70485,7 @@ index 000000000000..88a8e54fbd7a + struct jset_entry_usage, entry); + + u->entry.type = BCH_JSET_ENTRY_usage; -+ u->entry.btree_id = FS_USAGE_KEY_VERSION; ++ u->entry.btree_id = BCH_FS_USAGE_key_version; + u->v = cpu_to_le64(atomic64_read(&c->key_version)); + } + @@ -67284,7 +70495,7 @@ index 000000000000..88a8e54fbd7a + struct jset_entry_usage, entry); + + u->entry.type = BCH_JSET_ENTRY_usage; -+ u->entry.btree_id = FS_USAGE_RESERVED; ++ u->entry.btree_id = BCH_FS_USAGE_reserved; + u->entry.level = i; + u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); + } @@ -67360,7 +70571,7 @@ index 000000000000..88a8e54fbd7a + } + + sb_clean->flags 
= 0; -+ sb_clean->journal_seq = cpu_to_le64(journal_cur_seq(&c->journal) - 1); ++ sb_clean->journal_seq = cpu_to_le64(atomic64_read(&c->journal.seq)); + + /* Trying to catch outstanding bug: */ + BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX); @@ -67377,7 +70588,7 @@ index 000000000000..88a8e54fbd7a + * this should be in the write path, and we should be validating every + * superblock section: + */ -+ ret = bch2_sb_clean_validate(c, sb_clean, WRITE); ++ ret = bch2_sb_clean_validate_late(c, sb_clean, WRITE); + if (ret) { + bch_err(c, "error writing marking filesystem clean: validate error"); + goto out; @@ -67388,19 +70599,47 @@ index 000000000000..88a8e54fbd7a + mutex_unlock(&c->sb_lock); +} + -+static const char *bch2_sb_validate_clean(struct bch_sb *sb, -+ struct bch_sb_field *f) ++static int bch2_sb_clean_validate(struct bch_sb *sb, ++ struct bch_sb_field *f, ++ struct printbuf *err) +{ + struct bch_sb_field_clean *clean = field_to_type(f, clean); + -+ if (vstruct_bytes(&clean->field) < sizeof(*clean)) -+ return "invalid field crypt: wrong size"; ++ if (vstruct_bytes(&clean->field) < sizeof(*clean)) { ++ pr_buf(err, "wrong size (got %zu should be %zu)", ++ vstruct_bytes(&clean->field), sizeof(*clean)); ++ return -EINVAL; ++ } + -+ return NULL; ++ return 0; ++} ++ ++static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_clean *clean = field_to_type(f, clean); ++ struct jset_entry *entry; ++ ++ pr_buf(out, "flags: %x", le32_to_cpu(clean->flags)); ++ pr_newline(out); ++ pr_buf(out, "journal_seq: %llu", le64_to_cpu(clean->journal_seq)); ++ pr_newline(out); ++ ++ for (entry = clean->start; ++ entry != vstruct_end(&clean->field); ++ entry = vstruct_next(entry)) { ++ if (entry->type == BCH_JSET_ENTRY_btree_keys && ++ !entry->u64s) ++ continue; ++ ++ bch2_journal_entry_to_text(out, NULL, entry); ++ pr_newline(out); ++ } +} + +static const struct bch_sb_field_ops bch_sb_field_ops_clean = { -+ 
.validate = bch2_sb_validate_clean, ++ .validate = bch2_sb_clean_validate, ++ .to_text = bch2_sb_clean_to_text, +}; + +static const struct bch_sb_field_ops *bch2_sb_field_ops[] = { @@ -67410,14 +70649,27 @@ index 000000000000..88a8e54fbd7a +#undef x +}; + -+static const char *bch2_sb_field_validate(struct bch_sb *sb, -+ struct bch_sb_field *f) ++static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f, ++ struct printbuf *err) +{ + unsigned type = le32_to_cpu(f->type); ++ struct printbuf field_err = PRINTBUF; ++ int ret; + -+ return type < BCH_SB_FIELD_NR -+ ? bch2_sb_field_ops[type]->validate(sb, f) -+ : NULL; ++ if (type >= BCH_SB_FIELD_NR) ++ return 0; ++ ++ ret = bch2_sb_field_ops[type]->validate(sb, f, &field_err); ++ if (ret) { ++ pr_buf(err, "Invalid superblock section %s: %s", ++ bch2_sb_fields[type], ++ field_err.buf); ++ pr_newline(err); ++ bch2_sb_field_to_text(err, sb, f); ++ } ++ ++ printbuf_exit(&field_err); ++ return ret; +} + +void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, @@ -67427,22 +70679,192 @@ index 000000000000..88a8e54fbd7a + const struct bch_sb_field_ops *ops = type < BCH_SB_FIELD_NR + ? 
bch2_sb_field_ops[type] : NULL; + ++ if (!out->tabstops[0]) ++ out->tabstops[0] = 32; ++ + if (ops) + pr_buf(out, "%s", bch2_sb_fields[type]); + else + pr_buf(out, "(unknown field %u)", type); + -+ pr_buf(out, " (size %llu):", vstruct_bytes(f)); ++ pr_buf(out, " (size %zu):", vstruct_bytes(f)); ++ pr_newline(out); + -+ if (ops && ops->to_text) ++ if (ops && ops->to_text) { ++ pr_indent_push(out, 2); + bch2_sb_field_ops[type]->to_text(out, sb, f); ++ pr_indent_pop(out, 2); ++ } ++} ++ ++void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l) ++{ ++ unsigned i; ++ ++ pr_buf(out, "Type: %u", l->layout_type); ++ pr_newline(out); ++ ++ pr_buf(out, "Superblock max size: "); ++ pr_units(out, ++ 1 << l->sb_max_size_bits, ++ 512 << l->sb_max_size_bits); ++ pr_newline(out); ++ ++ pr_buf(out, "Nr superblocks: %u", l->nr_superblocks); ++ pr_newline(out); ++ ++ pr_buf(out, "Offsets: "); ++ for (i = 0; i < l->nr_superblocks; i++) { ++ if (i) ++ pr_buf(out, ", "); ++ pr_buf(out, "%llu", le64_to_cpu(l->sb_offset[i])); ++ } ++ pr_newline(out); ++} ++ ++void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, ++ bool print_layout, unsigned fields) ++{ ++ struct bch_sb_field_members *mi; ++ struct bch_sb_field *f; ++ u64 fields_have = 0; ++ unsigned nr_devices = 0; ++ ++ if (!out->tabstops[0]) ++ out->tabstops[0] = 32; ++ ++ mi = bch2_sb_get_members(sb); ++ if (mi) { ++ struct bch_member *m; ++ ++ for (m = mi->members; ++ m < mi->members + sb->nr_devices; ++ m++) ++ nr_devices += bch2_member_exists(m); ++ } ++ ++ pr_buf(out, "External UUID:"); ++ pr_tab(out); ++ pr_uuid(out, sb->user_uuid.b); ++ pr_newline(out); ++ ++ pr_buf(out, "Internal UUID:"); ++ pr_tab(out); ++ pr_uuid(out, sb->uuid.b); ++ pr_newline(out); ++ ++ pr_buf(out, "Device index:"); ++ pr_tab(out); ++ pr_buf(out, "%u", sb->dev_idx); ++ pr_newline(out); ++ ++ pr_buf(out, "Label:"); ++ pr_tab(out); ++ pr_buf(out, "%.*s", (int) sizeof(sb->label), sb->label); ++ pr_newline(out); ++ ++ pr_buf(out, 
"Version:"); ++ pr_tab(out); ++ pr_buf(out, "%s", bch2_metadata_versions[le16_to_cpu(sb->version)]); ++ pr_newline(out); ++ ++ pr_buf(out, "Oldest version on disk:"); ++ pr_tab(out); ++ pr_buf(out, "%s", bch2_metadata_versions[le16_to_cpu(sb->version_min)]); ++ pr_newline(out); ++ ++ pr_buf(out, "Created:"); ++ pr_tab(out); ++ if (sb->time_base_lo) ++ pr_time(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC)); ++ else ++ pr_buf(out, "(not set)"); ++ pr_newline(out); ++ ++ pr_buf(out, "Sequence number:"); ++ pr_tab(out); ++ pr_buf(out, "%llu", le64_to_cpu(sb->seq)); ++ pr_newline(out); ++ ++ pr_buf(out, "Superblock size:"); ++ pr_tab(out); ++ pr_buf(out, "%zu", vstruct_bytes(sb)); ++ pr_newline(out); ++ ++ pr_buf(out, "Clean:"); ++ pr_tab(out); ++ pr_buf(out, "%llu", BCH_SB_CLEAN(sb)); ++ pr_newline(out); ++ ++ pr_buf(out, "Devices:"); ++ pr_tab(out); ++ pr_buf(out, "%u", nr_devices); ++ pr_newline(out); ++ ++ pr_buf(out, "Sections:"); ++ vstruct_for_each(sb, f) ++ fields_have |= 1 << le32_to_cpu(f->type); ++ pr_tab(out); ++ bch2_flags_to_text(out, bch2_sb_fields, fields_have); ++ pr_newline(out); ++ ++ pr_buf(out, "Features:"); ++ pr_tab(out); ++ bch2_flags_to_text(out, bch2_sb_features, ++ le64_to_cpu(sb->features[0])); ++ pr_newline(out); ++ ++ pr_buf(out, "Compat features:"); ++ pr_tab(out); ++ bch2_flags_to_text(out, bch2_sb_compat, ++ le64_to_cpu(sb->compat[0])); ++ pr_newline(out); ++ ++ pr_newline(out); ++ pr_buf(out, "Options:"); ++ pr_newline(out); ++ pr_indent_push(out, 2); ++ { ++ enum bch_opt_id id; ++ ++ for (id = 0; id < bch2_opts_nr; id++) { ++ const struct bch_option *opt = bch2_opt_table + id; ++ ++ if (opt->get_sb != BCH2_NO_SB_OPT) { ++ u64 v = bch2_opt_from_sb(sb, id); ++ ++ pr_buf(out, "%s:", opt->attr.name); ++ pr_tab(out); ++ bch2_opt_to_text(out, NULL, sb, opt, v, ++ OPT_HUMAN_READABLE|OPT_SHOW_FULL_LIST); ++ pr_newline(out); ++ } ++ } ++ } ++ ++ pr_indent_pop(out, 2); ++ ++ if (print_layout) { ++ pr_newline(out); ++ pr_buf(out, 
"layout:"); ++ pr_newline(out); ++ pr_indent_push(out, 2); ++ bch2_sb_layout_to_text(out, &sb->layout); ++ pr_indent_pop(out, 2); ++ } ++ ++ vstruct_for_each(sb, f) ++ if (fields & (1 << le32_to_cpu(f->type))) { ++ pr_newline(out); ++ bch2_sb_field_to_text(out, sb, f); ++ } +} diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h new file mode 100644 -index 000000000000..b64ac2fbbf8b +index 000000000000..14a25f6fe29a --- /dev/null +++ b/fs/bcachefs/super-io.h -@@ -0,0 +1,136 @@ +@@ -0,0 +1,126 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUPER_IO_H +#define _BCACHEFS_SUPER_IO_H @@ -67483,9 +70905,8 @@ index 000000000000..b64ac2fbbf8b +extern const char * const bch2_sb_fields[]; + +struct bch_sb_field_ops { -+ const char * (*validate)(struct bch_sb *, struct bch_sb_field *); -+ void (*to_text)(struct printbuf *, struct bch_sb *, -+ struct bch_sb_field *); ++ int (*validate)(struct bch_sb *, struct bch_sb_field *, struct printbuf *); ++ void (*to_text)(struct printbuf *, struct bch_sb *, struct bch_sb_field *); +}; + +static inline __le64 bch2_sb_magic(struct bch_fs *c) @@ -67511,8 +70932,6 @@ index 000000000000..b64ac2fbbf8b +void bch2_free_super(struct bch_sb_handle *); +int bch2_sb_realloc(struct bch_sb_handle *, unsigned); + -+const char *bch2_sb_validate(struct bch_sb_handle *); -+ +int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *); +int bch2_write_super(struct bch_fs *); +void __bch2_check_set_feature(struct bch_fs *, unsigned); @@ -67523,15 +70942,6 @@ index 000000000000..b64ac2fbbf8b + __bch2_check_set_feature(c, feat); +} + -+/* BCH_SB_FIELD_journal: */ -+ -+static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j) -+{ -+ return j -+ ? 
(__le64 *) vstruct_end(&j->field) - j->buckets -+ : 0; -+} -+ +/* BCH_SB_FIELD_members: */ + +static inline bool bch2_member_exists(struct bch_member *m) @@ -67555,12 +70965,12 @@ index 000000000000..b64ac2fbbf8b + .bucket_size = le16_to_cpu(mi->bucket_size), + .group = BCH_MEMBER_GROUP(mi), + .state = BCH_MEMBER_STATE(mi), -+ .replacement = BCH_MEMBER_REPLACEMENT(mi), + .discard = BCH_MEMBER_DISCARD(mi), + .data_allowed = BCH_MEMBER_DATA_ALLOWED(mi), + .durability = BCH_MEMBER_DURABILITY(mi) + ? BCH_MEMBER_DURABILITY(mi) - 1 + : 1, ++ .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi), + .valid = !bch2_is_zero(mi->uuid.b, sizeof(uuid_le)), + }; +} @@ -67570,21 +70980,23 @@ index 000000000000..b64ac2fbbf8b +void bch2_journal_super_entries_add_common(struct bch_fs *, + struct jset_entry **, u64); + -+int bch2_sb_clean_validate(struct bch_fs *, struct bch_sb_field_clean *, int); ++int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int); + +int bch2_fs_mark_dirty(struct bch_fs *); +void bch2_fs_mark_clean(struct bch_fs *); + +void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, + struct bch_sb_field *); ++void bch2_sb_layout_to_text(struct printbuf *, struct bch_sb_layout *); ++void bch2_sb_to_text(struct printbuf *, struct bch_sb *, bool, unsigned); + +#endif /* _BCACHEFS_SUPER_IO_H */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c new file mode 100644 -index 000000000000..3744b6d519a7 +index 000000000000..4a071711d363 --- /dev/null +++ b/fs/bcachefs/super.c -@@ -0,0 +1,2110 @@ +@@ -0,0 +1,1966 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * bcachefs setup/teardown code, and some metadata io - read a superblock and @@ -67603,6 +71015,7 @@ index 000000000000..3744b6d519a7 +#include "btree_key_cache.h" +#include "btree_update_interior.h" +#include "btree_io.h" ++#include "buckets_waiting_for_journal.h" +#include "chardev.h" +#include "checksum.h" +#include "clock.h" @@ -67785,17 +71198,9 @@ index 
000000000000..3744b6d519a7 + */ + bch2_journal_flush_all_pins(&c->journal); + -+ /* -+ * If the allocator threads didn't all start up, the btree updates to -+ * write out alloc info aren't going to work: -+ */ -+ if (!test_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags)) -+ goto nowrote_alloc; -+ + bch_verbose(c, "flushing journal and stopping allocators"); + + bch2_journal_flush_all_pins(&c->journal); -+ set_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags); + + do { + clean_passes++; @@ -67820,17 +71225,11 @@ index 000000000000..3744b6d519a7 + bch_verbose(c, "flushing journal and stopping allocators complete"); + + set_bit(BCH_FS_ALLOC_CLEAN, &c->flags); -+nowrote_alloc: ++ + closure_wait_event(&c->btree_interior_update_wait, + !bch2_btree_interior_updates_nr_pending(c)); + flush_work(&c->btree_interior_update_work); + -+ for_each_member_device(ca, c, i) -+ bch2_dev_allocator_stop(ca); -+ -+ clear_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); -+ clear_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags); -+ + bch2_fs_journal_stop(&c->journal); + + /* @@ -67866,10 +71265,6 @@ index 000000000000..3744b6d519a7 + /* + * Block new foreground-end write operations from starting - any new + * writes will return -EROFS: -+ * -+ * (This is really blocking new _allocations_, writes to previously -+ * allocated space can still happen until stopping the allocator in -+ * bch2_dev_allocator_stop()). 
+ */ + percpu_ref_kill(&c->writes); + @@ -67998,19 +71393,7 @@ index 000000000000..3744b6d519a7 + bch2_dev_allocator_add(c, ca); + bch2_recalc_capacity(c); + -+ for_each_rw_member(ca, c, i) { -+ ret = bch2_dev_allocator_start(ca); -+ if (ret) { -+ bch_err(c, "error starting allocator threads"); -+ percpu_ref_put(&ca->io_ref); -+ goto err; -+ } -+ } -+ -+ set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); -+ -+ for_each_rw_member(ca, c, i) -+ bch2_wake_allocator(ca); ++ bch2_do_discards(c); + + if (!early) { + ret = bch2_fs_read_write_late(c); @@ -68055,6 +71438,7 @@ index 000000000000..3744b6d519a7 + bch2_fs_ec_exit(c); + bch2_fs_encryption_exit(c); + bch2_fs_io_exit(c); ++ bch2_fs_buckets_waiting_for_journal_exit(c); + bch2_fs_btree_interior_update_exit(c); + bch2_fs_btree_iter_exit(c); + bch2_fs_btree_key_cache_exit(&c->btree_key_cache); @@ -68259,6 +71643,7 @@ index 000000000000..3744b6d519a7 + INIT_WORK(&c->read_only_work, bch2_fs_read_only_work); + + init_rwsem(&c->gc_lock); ++ mutex_init(&c->gc_gens_lock); + + for (i = 0; i < BCH_TIME_STAT_NR; i++) + bch2_time_stats_init(&c->times[i]); @@ -68309,10 +71694,10 @@ index 000000000000..3744b6d519a7 + c->rebalance.enabled = 1; + c->promote_whole_extents = true; + -+ c->journal.write_time = &c->times[BCH_TIME_journal_write]; -+ c->journal.delay_time = &c->times[BCH_TIME_journal_delay]; -+ c->journal.blocked_time = &c->times[BCH_TIME_blocked_journal]; -+ c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; ++ c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write]; ++ c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write]; ++ c->journal.blocked_time = &c->times[BCH_TIME_blocked_journal]; ++ c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; + + bch2_fs_btree_cache_init_early(&c->btree_cache); + @@ -68329,13 +71714,32 @@ index 000000000000..3744b6d519a7 + if (ret) + goto err; + -+ scnprintf(c->name, sizeof(c->name), "%pU", &c->sb.user_uuid); ++ 
uuid_unparse_lower(c->sb.user_uuid.b, c->name); ++ ++ /* Compat: */ ++ if (sb->version <= bcachefs_metadata_version_inode_v2 && ++ !BCH_SB_JOURNAL_FLUSH_DELAY(sb)) ++ SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000); ++ ++ if (sb->version <= bcachefs_metadata_version_inode_v2 && ++ !BCH_SB_JOURNAL_RECLAIM_DELAY(sb)) ++ SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 100); + + c->opts = bch2_opts_default; -+ bch2_opts_apply(&c->opts, bch2_opts_from_sb(sb)); ++ ret = bch2_opts_from_sb(&c->opts, sb); ++ if (ret) ++ goto err; ++ + bch2_opts_apply(&c->opts, opts); + -+ c->block_bits = ilog2(c->opts.block_size); ++ /* key cache currently disabled for inodes, because of snapshots: */ ++ c->opts.inodes_use_key_cache = 0; ++ ++ c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc; ++ if (c->opts.inodes_use_key_cache) ++ c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes; ++ ++ c->block_bits = ilog2(block_sectors(c)); + c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c); + + if (bch2_fs_init_fault("fs_alloc")) { @@ -68385,6 +71789,7 @@ index 000000000000..3744b6d519a7 + bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: + bch2_fs_btree_iter_init(c) ?: + bch2_fs_btree_interior_update_init(c) ?: ++ bch2_fs_buckets_waiting_for_journal_init(c); + bch2_fs_subvolumes_init(c) ?: + bch2_fs_io_init(c) ?: + bch2_fs_encryption_init(c) ?: @@ -68394,9 +71799,6 @@ index 000000000000..3744b6d519a7 + if (ret) + goto err; + -+ if (c->opts.nochanges) -+ set_bit(JOURNAL_NOCHANGES, &c->journal.flags); -+ + mi = bch2_sb_get_members(c->disk_sb.sb); + for (i = 0; i < c->sb.nr_devices; i++) + if (bch2_dev_exists(c->disk_sb.sb, mi, i) && @@ -68432,12 +71834,9 @@ index 000000000000..3744b6d519a7 +static void print_mount_opts(struct bch_fs *c) +{ + enum bch_opt_id i; -+ char buf[512]; -+ struct printbuf p = PBUF(buf); ++ struct printbuf p = PRINTBUF; + bool first = true; + -+ strcpy(buf, "(null)"); -+ + if (c->opts.read_only) { + pr_buf(&p, "ro"); + first = false; @@ -68447,7 +71846,7 @@ index 
000000000000..3744b6d519a7 + const struct bch_option *opt = &bch2_opt_table[i]; + u64 v = bch2_opt_get_by_id(&c->opts, i); + -+ if (!(opt->mode & OPT_MOUNT)) ++ if (!(opt->flags & OPT_MOUNT)) + continue; + + if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) @@ -68456,10 +71855,14 @@ index 000000000000..3744b6d519a7 + if (!first) + pr_buf(&p, ","); + first = false; -+ bch2_opt_to_text(&p, c, opt, v, OPT_SHOW_MOUNT_STYLE); ++ bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE); + } + -+ bch_info(c, "mounted with opts: %s", buf); ++ if (!p.pos) ++ pr_buf(&p, "(null)"); ++ ++ bch_info(c, "mounted version=%s opts=%s", bch2_metadata_versions[c->sb.version], p.buf); ++ printbuf_exit(&p); +} + +int bch2_fs_start(struct bch_fs *c) @@ -68507,20 +71910,6 @@ index 000000000000..3744b6d519a7 + + set_bit(BCH_FS_STARTED, &c->flags); + -+ /* -+ * Allocator threads don't start filling copygc reserve until after we -+ * set BCH_FS_STARTED - wake them now: -+ * -+ * XXX ugly hack: -+ * Need to set ca->allocator_state here instead of relying on the -+ * allocator threads to do it to avoid racing with the copygc threads -+ * checking it and thinking they have no alloc reserve: -+ */ -+ for_each_online_member(ca, c, i) { -+ ca->allocator_state = ALLOCATOR_running; -+ bch2_wake_allocator(ca); -+ } -+ + if (c->opts.read_only || c->opts.nochanges) { + bch2_fs_read_only(c); + } else { @@ -68573,7 +71962,7 @@ index 000000000000..3744b6d519a7 + if (!sb_mi) + return "Invalid superblock: member info area missing"; + -+ if (le16_to_cpu(sb->block_size) != c->opts.block_size) ++ if (le16_to_cpu(sb->block_size) != block_sectors(c)) + return "mismatched block size"; + + if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) < @@ -68612,8 +72001,6 @@ index 000000000000..3744b6d519a7 + +static void bch2_dev_free(struct bch_dev *ca) +{ -+ bch2_dev_allocator_stop(ca); -+ + cancel_work_sync(&ca->io_error_work); + + if (ca->kobj.state_in_sysfs && @@ -68728,8 +72115,8 @@ index 
000000000000..3744b6d519a7 + ca->mi = bch2_mi_to_cpu(member); + ca->uuid = member->uuid; + -+ if (opt_defined(c->opts, discard)) -+ ca->mi.discard = opt_get(c->opts, discard); ++ ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, ++ ca->mi.bucket_size / btree_sectors(c)); + + if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, + 0, GFP_KERNEL) || @@ -68780,12 +72167,6 @@ index 000000000000..3744b6d519a7 + + ca->fs = c; + -+ if (ca->mi.state == BCH_MEMBER_STATE_rw && -+ bch2_dev_allocator_start(ca)) { -+ bch2_dev_free(ca); -+ goto err; -+ } -+ + bch2_dev_attach(c, ca, dev_idx); +out: + pr_verbose_init(c->opts, "ret %i", ret); @@ -68831,6 +72212,8 @@ index 000000000000..3744b6d519a7 + ca->disk_sb.bdev->bd_holder = ca; + memset(sb, 0, sizeof(*sb)); + ++ ca->dev = ca->disk_sb.bdev->bd_dev; ++ + percpu_ref_reinit(&ca->io_ref); + + return 0; @@ -68969,14 +72352,13 @@ index 000000000000..3744b6d519a7 + /* + * The allocator thread itself allocates btree nodes, so stop it first: + */ -+ bch2_dev_allocator_stop(ca); + bch2_dev_allocator_remove(c, ca); + bch2_dev_journal_stop(&c->journal, ca); + + bch2_copygc_start(c); +} + -+static int __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) ++static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) +{ + lockdep_assert_held(&c->state_lock); + @@ -68984,8 +72366,6 @@ index 000000000000..3744b6d519a7 + + bch2_dev_allocator_add(c, ca); + bch2_recalc_capacity(c); -+ -+ return bch2_dev_allocator_start(ca); +} + +int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, @@ -69012,7 +72392,7 @@ index 000000000000..3744b6d519a7 + mutex_unlock(&c->sb_lock); + + if (new_state == BCH_MEMBER_STATE_rw) -+ ret = __bch2_dev_read_write(c, ca); ++ __bch2_dev_read_write(c, ca); + + rebalance_wakeup(c); + @@ -69035,30 +72415,20 @@ index 000000000000..3744b6d519a7 + +static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) +{ -+ struct btree_trans trans; -+ size_t i; ++ struct bpos start = 
POS(ca->dev_idx, 0); ++ struct bpos end = POS(ca->dev_idx, U64_MAX); + int ret; + -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for (i = 0; i < ca->mi.nbuckets; i++) { -+ ret = lockrestart_do(&trans, -+ bch2_btree_key_cache_flush(&trans, -+ BTREE_ID_alloc, POS(ca->dev_idx, i))); -+ if (ret) -+ break; -+ } -+ bch2_trans_exit(&trans); -+ -+ if (ret) { ++ ret = bch2_btree_delete_range(c, BTREE_ID_alloc, start, end, ++ BTREE_TRIGGER_NORUN, NULL) ?: ++ bch2_btree_delete_range(c, BTREE_ID_freespace, start, end, ++ BTREE_TRIGGER_NORUN, NULL) ?: ++ bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end, ++ BTREE_TRIGGER_NORUN, NULL); ++ if (ret) + bch_err(c, "error %i removing dev alloc info", ret); -+ return ret; -+ } + -+ return bch2_btree_delete_range(c, BTREE_ID_alloc, -+ POS(ca->dev_idx, 0), -+ POS(ca->dev_idx + 1, 0), -+ NULL); ++ return ret; +} + +int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) @@ -69123,11 +72493,11 @@ index 000000000000..3744b6d519a7 + + data = bch2_dev_has_data(c, ca); + if (data) { -+ char data_has_str[100]; ++ struct printbuf data_has = PRINTBUF; + -+ bch2_flags_to_text(&PBUF(data_has_str), -+ bch2_data_types, data); -+ bch_err(ca, "Remove failed, still has data (%s)", data_has_str); ++ bch2_flags_to_text(&data_has, bch2_data_types, data); ++ bch_err(ca, "Remove failed, still has data (%s)", data_has.buf); ++ printbuf_exit(&data_has); + ret = -EBUSY; + goto err; + } @@ -69175,69 +72545,59 @@ index 000000000000..3744b6d519a7 + struct bch_dev *ca = NULL; + struct bch_sb_field_members *mi; + struct bch_member dev_mi; -+ struct bucket_array *buckets; -+ struct bucket *g; + unsigned dev_idx, nr_devices, u64s; ++ struct printbuf errbuf = PRINTBUF; + int ret; + + ret = bch2_read_super(path, &opts, &sb); -+ if (ret) -+ return ret; -+ -+ err = bch2_sb_validate(&sb); -+ if (err) -+ return -EINVAL; ++ if (ret) { ++ bch_err(c, "device add error: error reading super: %i", ret); ++ goto err; ++ } + + dev_mi = 
bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx]; + + err = bch2_dev_may_add(sb.sb, c); -+ if (err) -+ return -EINVAL; ++ if (err) { ++ bch_err(c, "device add error: %s", err); ++ ret = -EINVAL; ++ goto err; ++ } + + ca = __bch2_dev_alloc(c, &dev_mi); + if (!ca) { + bch2_free_super(&sb); -+ return -ENOMEM; ++ ret = -ENOMEM; ++ goto err; + } + + ret = __bch2_dev_attach_bdev(ca, &sb); + if (ret) { + bch2_dev_free(ca); -+ return ret; ++ goto err; + } + -+ /* -+ * We want to allocate journal on the new device before adding the new -+ * device to the filesystem because allocating after we attach requires -+ * spinning up the allocator thread, and the allocator thread requires -+ * doing btree writes, which if the existing devices are RO isn't going -+ * to work -+ * -+ * So we have to mark where the superblocks are, but marking allocated -+ * data normally updates the filesystem usage too, so we have to mark, -+ * allocate the journal, reset all the marks, then remark after we -+ * attach... 
-+ */ -+ bch2_mark_dev_superblock(NULL, ca, 0); -+ -+ err = "journal alloc failed"; + ret = bch2_dev_journal_alloc(ca); -+ if (ret) ++ if (ret) { ++ bch_err(c, "device add error: journal alloc failed"); + goto err; ++ } + + down_write(&c->state_lock); + mutex_lock(&c->sb_lock); + -+ err = "insufficient space in new superblock"; + ret = bch2_sb_from_fs(c, ca); -+ if (ret) ++ if (ret) { ++ bch_err(c, "device add error: new device superblock too small"); + goto err_unlock; ++ } + + mi = bch2_sb_get_members(ca->disk_sb.sb); + + if (!bch2_sb_resize_members(&ca->disk_sb, + le32_to_cpu(mi->field.u64s) + + sizeof(dev_mi) / sizeof(u64))) { ++ bch_err(c, "device add error: new device superblock too small"); + ret = -ENOSPC; + goto err_unlock; + } @@ -69250,7 +72610,7 @@ index 000000000000..3744b6d519a7 + if (!bch2_dev_exists(c->disk_sb.sb, mi, dev_idx)) + goto have_slot; +no_slot: -+ err = "no slots available in superblock"; ++ bch_err(c, "device add error: already have maximum number of devices"); + ret = -ENOSPC; + goto err_unlock; + @@ -69259,12 +72619,12 @@ index 000000000000..3744b6d519a7 + u64s = (sizeof(struct bch_sb_field_members) + + sizeof(struct bch_member) * nr_devices) / sizeof(u64); + -+ err = "no space in superblock for member info"; -+ ret = -ENOSPC; -+ + mi = bch2_sb_resize_members(&c->disk_sb, u64s); -+ if (!mi) ++ if (!mi) { ++ bch_err(c, "device add error: no room in superblock for member info"); ++ ret = -ENOSPC; + goto err_unlock; ++ } + + /* success: */ + @@ -69280,27 +72640,23 @@ index 000000000000..3744b6d519a7 + + bch2_dev_usage_journal_reserve(c); + -+ /* -+ * Clear marks before marking transactionally in the btree, so that -+ * per-device accounting gets done correctly: -+ */ -+ down_read(&ca->bucket_lock); -+ buckets = bucket_array(ca); -+ for_each_bucket(g, buckets) -+ atomic64_set(&g->_mark.v, 0); -+ up_read(&ca->bucket_lock); -+ -+ err = "error marking superblock"; + ret = bch2_trans_mark_dev_sb(c, ca); -+ if (ret) ++ if (ret) { ++ bch_err(c, 
"device add error: error marking new superblock: %i", ret); + goto err_late; -+ -+ if (ca->mi.state == BCH_MEMBER_STATE_rw) { -+ ret = __bch2_dev_read_write(c, ca); -+ if (ret) -+ goto err_late; + } + ++ ret = bch2_fs_freespace_init(c); ++ if (ret) { ++ bch_err(c, "device add error: error initializing free space: %i", ret); ++ goto err_late; ++ } ++ ++ ca->new_fs_bucket_idx = 0; ++ ++ if (ca->mi.state == BCH_MEMBER_STATE_rw) ++ __bch2_dev_read_write(c, ca); ++ + up_write(&c->state_lock); + return 0; + @@ -69311,12 +72667,12 @@ index 000000000000..3744b6d519a7 + if (ca) + bch2_dev_free(ca); + bch2_free_super(&sb); -+ bch_err(c, "Unable to add device: %s", err); ++ printbuf_exit(&errbuf); + return ret; +err_late: + up_write(&c->state_lock); -+ bch_err(c, "Error going rw after adding device: %s", err); -+ return -EINVAL; ++ ca = NULL; ++ goto err; +} + +/* Hot add existing device to running filesystem: */ @@ -69359,11 +72715,8 @@ index 000000000000..3744b6d519a7 + goto err; + } + -+ if (ca->mi.state == BCH_MEMBER_STATE_rw) { -+ ret = __bch2_dev_read_write(c, ca); -+ if (ret) -+ goto err; -+ } ++ if (ca->mi.state == BCH_MEMBER_STATE_rw) ++ __bch2_dev_read_write(c, ca); + + mutex_lock(&c->sb_lock); + mi = bch2_sb_get_members(c->disk_sb.sb); @@ -69450,20 +72803,14 @@ index 000000000000..3744b6d519a7 +} + +/* return with ref on ca->ref: */ -+struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *path) ++struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name) +{ + struct bch_dev *ca; -+ dev_t dev; + unsigned i; -+ int ret; -+ -+ ret = lookup_bdev(path, &dev); -+ if (ret) -+ return ERR_PTR(ret); + + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, NULL) -+ if (ca->disk_sb.bdev->bd_dev == dev) ++ if (!strcmp(name, ca->name)) + goto found; + ca = ERR_PTR(-ENOENT); +found: @@ -69482,18 +72829,17 @@ index 000000000000..3744b6d519a7 + struct bch_sb_field_members *mi; + unsigned i, best_sb = 0; + const char *err; ++ struct printbuf errbuf = PRINTBUF; + 
int ret = 0; + ++ if (!try_module_get(THIS_MODULE)) ++ return ERR_PTR(-ENODEV); ++ + pr_verbose_init(opts, ""); + + if (!nr_devices) { -+ c = ERR_PTR(-EINVAL); -+ goto out2; -+ } -+ -+ if (!try_module_get(THIS_MODULE)) { -+ c = ERR_PTR(-ENODEV); -+ goto out2; ++ ret = -EINVAL; ++ goto err; + } + + sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL); @@ -69507,9 +72853,6 @@ index 000000000000..3744b6d519a7 + if (ret) + goto err; + -+ err = bch2_sb_validate(&sb[i]); -+ if (err) -+ goto err_print; + } + + for (i = 1; i < nr_devices; i++) @@ -69564,8 +72907,8 @@ index 000000000000..3744b6d519a7 + } +out: + kfree(sb); ++ printbuf_exit(&errbuf); + module_put(THIS_MODULE); -+out2: + pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c)); + return c; +err_print: @@ -69582,81 +72925,6 @@ index 000000000000..3744b6d519a7 + goto out; +} + -+static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb, -+ struct bch_opts opts) -+{ -+ const char *err; -+ struct bch_fs *c; -+ bool allocated_fs = false; -+ int ret; -+ -+ err = bch2_sb_validate(sb); -+ if (err) -+ return err; -+ -+ mutex_lock(&bch_fs_list_lock); -+ c = __bch2_uuid_to_fs(sb->sb->uuid); -+ if (c) { -+ closure_get(&c->cl); -+ -+ err = bch2_dev_in_fs(c->disk_sb.sb, sb->sb); -+ if (err) -+ goto err; -+ } else { -+ allocated_fs = true; -+ c = bch2_fs_alloc(sb->sb, opts); -+ -+ err = "bch2_fs_alloc() error"; -+ if (IS_ERR(c)) -+ goto err; -+ } -+ -+ err = "bch2_dev_online() error"; -+ -+ mutex_lock(&c->sb_lock); -+ if (bch2_dev_attach_bdev(c, sb)) { -+ mutex_unlock(&c->sb_lock); -+ goto err; -+ } -+ mutex_unlock(&c->sb_lock); -+ -+ if (!c->opts.nostart && bch2_fs_may_start(c)) { -+ err = "error starting filesystem"; -+ ret = bch2_fs_start(c); -+ if (ret) -+ goto err; -+ } -+ -+ closure_put(&c->cl); -+ mutex_unlock(&bch_fs_list_lock); -+ -+ return NULL; -+err: -+ mutex_unlock(&bch_fs_list_lock); -+ -+ if (allocated_fs && !IS_ERR(c)) -+ bch2_fs_stop(c); -+ else if (c) -+ closure_put(&c->cl); -+ -+ return err; -+} -+ 
-+const char *bch2_fs_open_incremental(const char *path) -+{ -+ struct bch_sb_handle sb; -+ struct bch_opts opts = bch2_opts_empty(); -+ const char *err; -+ -+ if (bch2_read_super(path, &opts, &sb)) -+ return "error reading superblock"; -+ -+ err = __bch2_fs_open_incremental(&sb, opts); -+ bch2_free_super(&sb); -+ -+ return err; -+} -+ +/* Global interfaces/init */ + +static void bcachefs_exit(void) @@ -69697,10 +72965,10 @@ index 000000000000..3744b6d519a7 +module_init(bcachefs_init); diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h new file mode 100644 -index 000000000000..739e8fd18176 +index 000000000000..6d3efda26e63 --- /dev/null +++ b/fs/bcachefs/super.h -@@ -0,0 +1,238 @@ +@@ -0,0 +1,264 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUPER_H +#define _BCACHEFS_SUPER_H @@ -69729,6 +72997,12 @@ index 000000000000..739e8fd18176 + return remainder; +} + ++static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s, ++ u32 *offset) ++{ ++ return div_u64_rem(s, ca->mi.bucket_size, offset); ++} ++ +static inline bool bch2_dev_is_online(struct bch_dev *ca) +{ + return !percpu_ref_is_zero(&ca->io_ref); @@ -69897,6 +73171,27 @@ index 000000000000..739e8fd18176 + return devs; +} + ++static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b) ++{ ++ struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; ++ u64 b_offset = bucket_to_sector(ca, b); ++ u64 b_end = bucket_to_sector(ca, b + 1); ++ unsigned i; ++ ++ if (!b) ++ return true; ++ ++ for (i = 0; i < layout->nr_superblocks; i++) { ++ u64 offset = le64_to_cpu(layout->sb_offset[i]); ++ u64 end = offset + (1 << layout->sb_max_size_bits); ++ ++ if (!(offset >= b_end || end <= b_offset)) ++ return true; ++ } ++ ++ return false; ++} ++ +struct bch_fs *bch2_dev_to_fs(dev_t); +struct bch_fs *bch2_uuid_to_fs(uuid_le); + @@ -69936,12 +73231,11 @@ index 000000000000..739e8fd18176 + +int bch2_fs_start(struct bch_fs *); +struct bch_fs *bch2_fs_open(char * const *, unsigned, 
struct bch_opts); -+const char *bch2_fs_open_incremental(const char *path); + +#endif /* _BCACHEFS_SUPER_H */ diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h new file mode 100644 -index 000000000000..96023f37afea +index 000000000000..89419fc7930d --- /dev/null +++ b/fs/bcachefs/super_types.h @@ -0,0 +1,51 @@ @@ -69976,10 +73270,10 @@ index 000000000000..96023f37afea + u16 bucket_size; /* sectors */ + u16 group; + u8 state; -+ u8 replacement; + u8 discard; + u8 data_allowed; + u8 durability; ++ u8 freespace_initialized; + u8 valid; +}; + @@ -69998,10 +73292,10 @@ index 000000000000..96023f37afea +#endif /* _BCACHEFS_SUPER_TYPES_H */ diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c new file mode 100644 -index 000000000000..864be8601868 +index 000000000000..2594fec4b821 --- /dev/null +++ b/fs/bcachefs/sysfs.c -@@ -0,0 +1,1009 @@ +@@ -0,0 +1,889 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * bcache sysfs interfaces @@ -70014,6 +73308,7 @@ index 000000000000..864be8601868 + +#include "bcachefs.h" +#include "alloc_background.h" ++#include "alloc_foreground.h" +#include "sysfs.h" +#include "btree_cache.h" +#include "btree_io.h" @@ -70049,8 +73344,28 @@ index 000000000000..864be8601868 +} + +#define SHOW(fn) \ ++static ssize_t fn ## _to_text(struct printbuf *, \ ++ struct kobject *, struct attribute *);\ ++ \ +static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\ + char *buf) \ ++{ \ ++ struct printbuf out = PRINTBUF; \ ++ ssize_t ret = fn ## _to_text(&out, kobj, attr); \ ++ \ ++ if (!ret && out.allocation_failure) \ ++ ret = -ENOMEM; \ ++ \ ++ if (!ret) { \ ++ ret = min_t(size_t, out.pos, PAGE_SIZE - 1); \ ++ memcpy(buf, out.buf, ret); \ ++ } \ ++ printbuf_exit(&out); \ ++ return ret; \ ++} \ ++ \ ++static ssize_t fn ## _to_text(struct printbuf *out, struct kobject *kobj,\ ++ struct attribute *attr) + +#define STORE(fn) \ +static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\ @@ -70067,22 +73382,19 @@ 
index 000000000000..864be8601868 +#define sysfs_printf(file, fmt, ...) \ +do { \ + if (attr == &sysfs_ ## file) \ -+ return scnprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__);\ ++ pr_buf(out, fmt "\n", __VA_ARGS__); \ +} while (0) + +#define sysfs_print(file, var) \ +do { \ + if (attr == &sysfs_ ## file) \ -+ return snprint(buf, PAGE_SIZE, var); \ ++ snprint(out, var); \ +} while (0) + +#define sysfs_hprint(file, val) \ +do { \ -+ if (attr == &sysfs_ ## file) { \ -+ bch2_hprint(&out, val); \ -+ pr_buf(&out, "\n"); \ -+ return out.pos - buf; \ -+ } \ ++ if (attr == &sysfs_ ## file) \ ++ bch2_hprint(out, val); \ +} while (0) + +#define var_printf(_var, fmt) sysfs_printf(_var, fmt, var(_var)) @@ -70135,7 +73447,6 @@ index 000000000000..864be8601868 + return strtoi_h(buf, &var) ?: (ssize_t) size; \ +} while (0) + -+write_attribute(trigger_journal_flush); +write_attribute(trigger_gc); +write_attribute(prune_cache); +rw_attribute(btree_gc_periodic); @@ -70144,8 +73455,6 @@ index 000000000000..864be8601868 +read_attribute(uuid); +read_attribute(minor); +read_attribute(bucket_size); -+read_attribute(block_size); -+read_attribute(btree_node_size); +read_attribute(first_bucket); +read_attribute(nbuckets); +read_attribute(durability); @@ -70159,13 +73468,10 @@ index 000000000000..864be8601868 + +read_attribute(btree_avg_write_size); + -+read_attribute(reserve_stats); +read_attribute(btree_cache_size); +read_attribute(compression_stats); +read_attribute(journal_debug); -+read_attribute(journal_pins); +read_attribute(btree_updates); -+read_attribute(dirty_btree_nodes); +read_attribute(btree_cache); +read_attribute(btree_key_cache); +read_attribute(btree_transactions); @@ -70176,17 +73482,13 @@ index 000000000000..864be8601868 + +read_attribute(has_data); +read_attribute(alloc_debug); -+write_attribute(wake_allocator); + +read_attribute(read_realloc_races); +read_attribute(extent_migrate_done); +read_attribute(extent_migrate_raced); -+ -+rw_attribute(journal_write_delay_ms); 
-+rw_attribute(journal_reclaim_delay_ms); ++read_attribute(bucket_alloc_fail); + +rw_attribute(discard); -+rw_attribute(cache_replacement_policy); +rw_attribute(label); + +rw_attribute(copy_gc_enabled); @@ -70202,7 +73504,7 @@ index 000000000000..864be8601868 +read_attribute(io_timers_read); +read_attribute(io_timers_write); + -+read_attribute(data_op_data_progress); ++read_attribute(data_jobs); + +#ifdef CONFIG_BCACHEFS_TESTS +write_attribute(perf_test); @@ -70240,59 +73542,36 @@ index 000000000000..864be8601868 + return nr ? div64_u64(sectors, nr) : 0; +} + -+static long stats_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bch_move_stats *stats) -+{ -+ pr_buf(out, "%s: data type %s btree_id %s position: ", -+ stats->name, -+ bch2_data_types[stats->data_type], -+ bch2_btree_ids[stats->btree_id]); -+ bch2_bpos_to_text(out, stats->pos); -+ pr_buf(out, "%s", "\n"); -+ -+ return 0; -+} -+ +static long data_progress_to_text(struct printbuf *out, struct bch_fs *c) +{ + long ret = 0; -+ struct bch_move_stats *iter; ++ struct bch_move_stats *stats; + + mutex_lock(&c->data_progress_lock); -+ -+ if (list_empty(&c->data_progress_list)) -+ pr_buf(out, "%s", "no progress to report\n"); -+ else -+ list_for_each_entry(iter, &c->data_progress_list, list) { -+ stats_to_text(out, c, iter); -+ } ++ list_for_each_entry(stats, &c->data_progress_list, list) { ++ pr_buf(out, "%s: data type %s btree_id %s position: ", ++ stats->name, ++ bch2_data_types[stats->data_type], ++ bch2_btree_ids[stats->btree_id]); ++ bch2_bpos_to_text(out, stats->pos); ++ pr_buf(out, "%s", "\n"); ++ } + + mutex_unlock(&c->data_progress_lock); + return ret; +} + -+static int fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c) -+{ -+ struct bch_fs_usage_online *fs_usage = bch2_fs_usage_read(c); -+ -+ if (!fs_usage) -+ return -ENOMEM; -+ -+ bch2_fs_usage_to_text(out, c, fs_usage); -+ -+ percpu_up_read(&c->mark_lock); -+ -+ kfree(fs_usage); -+ return 0; -+} -+ +static int 
bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; -+ u64 nr_uncompressed_extents = 0, uncompressed_sectors = 0, ++ enum btree_id id; ++ u64 nr_uncompressed_extents = 0, + nr_compressed_extents = 0, ++ nr_incompressible_extents = 0, ++ uncompressed_sectors = 0, ++ incompressible_sectors = 0, + compressed_sectors_compressed = 0, + compressed_sectors_uncompressed = 0; + int ret; @@ -70302,47 +73581,72 @@ index 000000000000..864be8601868 + + bch2_trans_init(&trans, c, 0, 0); + -+ for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, 0, k, ret) -+ if (k.k->type == KEY_TYPE_extent) { -+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); ++ for (id = 0; id < BTREE_ID_NR; id++) { ++ if (!((1U << id) & BTREE_ID_HAS_PTRS)) ++ continue; ++ ++ for_each_btree_key(&trans, iter, id, POS_MIN, ++ BTREE_ITER_ALL_SNAPSHOTS, k, ret) { ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; ++ bool compressed = false, uncompressed = false, incompressible = false; + -+ extent_for_each_ptr_decode(e, p, entry) { -+ if (!crc_is_compressed(p.crc)) { -+ nr_uncompressed_extents++; -+ uncompressed_sectors += e.k->size; -+ } else { -+ nr_compressed_extents++; ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ switch (p.crc.compression_type) { ++ case BCH_COMPRESSION_TYPE_none: ++ uncompressed = true; ++ uncompressed_sectors += k.k->size; ++ break; ++ case BCH_COMPRESSION_TYPE_incompressible: ++ incompressible = true; ++ incompressible_sectors += k.k->size; ++ break; ++ default: + compressed_sectors_compressed += + p.crc.compressed_size; + compressed_sectors_uncompressed += + p.crc.uncompressed_size; ++ compressed = true; ++ break; + } -+ -+ /* only looking at the first ptr */ -+ break; + } ++ ++ if (incompressible) ++ nr_incompressible_extents++; ++ else if (uncompressed) ++ nr_uncompressed_extents++; ++ else if (compressed) 
++ nr_compressed_extents++; + } -+ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_iter_exit(&trans, &iter); ++ } + + bch2_trans_exit(&trans); ++ + if (ret) + return ret; + -+ pr_buf(out, -+ "uncompressed data:\n" -+ " nr extents: %llu\n" -+ " size (bytes): %llu\n" -+ "compressed data:\n" -+ " nr extents: %llu\n" -+ " compressed size (bytes): %llu\n" -+ " uncompressed size (bytes): %llu\n", -+ nr_uncompressed_extents, -+ uncompressed_sectors << 9, -+ nr_compressed_extents, -+ compressed_sectors_compressed << 9, -+ compressed_sectors_uncompressed << 9); ++ pr_buf(out, "uncompressed:\n"); ++ pr_buf(out, " nr extents: %llu\n", nr_uncompressed_extents); ++ pr_buf(out, " size: "); ++ bch2_hprint(out, uncompressed_sectors << 9); ++ pr_buf(out, "\n"); ++ ++ pr_buf(out, "compressed:\n"); ++ pr_buf(out, " nr extents: %llu\n", nr_compressed_extents); ++ pr_buf(out, " compressed size: "); ++ bch2_hprint(out, compressed_sectors_compressed << 9); ++ pr_buf(out, "\n"); ++ pr_buf(out, " uncompressed size: "); ++ bch2_hprint(out, compressed_sectors_uncompressed << 9); ++ pr_buf(out, "\n"); ++ ++ pr_buf(out, "incompressible:\n"); ++ pr_buf(out, " nr extents: %llu\n", nr_incompressible_extents); ++ pr_buf(out, " size: "); ++ bch2_hprint(out, incompressible_sectors << 9); ++ pr_buf(out, "\n"); + return 0; +} + @@ -70356,16 +73660,10 @@ index 000000000000..864be8601868 +SHOW(bch2_fs) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); -+ struct printbuf out = _PBUF(buf, PAGE_SIZE); + + sysfs_print(minor, c->minor); + sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b); + -+ sysfs_print(journal_write_delay_ms, c->journal.write_delay_ms); -+ sysfs_print(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms); -+ -+ sysfs_print(block_size, block_bytes(c)); -+ sysfs_print(btree_node_size, btree_bytes(c)); + sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c)); + sysfs_hprint(btree_avg_write_size, bch2_btree_avg_write_size(c)); + @@ -70375,13 +73673,13 @@ index 
000000000000..864be8601868 + atomic_long_read(&c->extent_migrate_done)); + sysfs_print(extent_migrate_raced, + atomic_long_read(&c->extent_migrate_raced)); ++ sysfs_print(bucket_alloc_fail, ++ atomic_long_read(&c->bucket_alloc_fail)); + + sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic); + -+ if (attr == &sysfs_gc_gens_pos) { -+ bch2_gc_gens_pos_to_text(&out, c); -+ return out.pos - buf; -+ } ++ if (attr == &sysfs_gc_gens_pos) ++ bch2_gc_gens_pos_to_text(out, c); + + sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); + @@ -70391,86 +73689,48 @@ index 000000000000..864be8601868 + max(0LL, c->copygc_wait - + atomic64_read(&c->io_clock[WRITE].now)) << 9); + -+ if (attr == &sysfs_rebalance_work) { -+ bch2_rebalance_work_to_text(&out, c); -+ return out.pos - buf; -+ } ++ if (attr == &sysfs_rebalance_work) ++ bch2_rebalance_work_to_text(out, c); + + sysfs_print(promote_whole_extents, c->promote_whole_extents); + + /* Debugging: */ + -+ if (attr == &sysfs_alloc_debug) -+ return fs_alloc_debug_to_text(&out, c) ?: out.pos - buf; ++ if (attr == &sysfs_journal_debug) ++ bch2_journal_debug_to_text(out, &c->journal); + -+ if (attr == &sysfs_journal_debug) { -+ bch2_journal_debug_to_text(&out, &c->journal); -+ return out.pos - buf; -+ } ++ if (attr == &sysfs_btree_updates) ++ bch2_btree_updates_to_text(out, c); + -+ if (attr == &sysfs_journal_pins) { -+ bch2_journal_pins_to_text(&out, &c->journal); -+ return out.pos - buf; -+ } ++ if (attr == &sysfs_btree_cache) ++ bch2_btree_cache_to_text(out, c); + -+ if (attr == &sysfs_btree_updates) { -+ bch2_btree_updates_to_text(&out, c); -+ return out.pos - buf; -+ } ++ if (attr == &sysfs_btree_key_cache) ++ bch2_btree_key_cache_to_text(out, &c->btree_key_cache); + -+ if (attr == &sysfs_dirty_btree_nodes) { -+ bch2_dirty_btree_nodes_to_text(&out, c); -+ return out.pos - buf; -+ } ++ if (attr == &sysfs_btree_transactions) ++ bch2_btree_trans_to_text(out, c); + -+ if (attr == &sysfs_btree_cache) { -+ 
bch2_btree_cache_to_text(&out, c); -+ return out.pos - buf; -+ } ++ if (attr == &sysfs_stripes_heap) ++ bch2_stripes_heap_to_text(out, c); + -+ if (attr == &sysfs_btree_key_cache) { -+ bch2_btree_key_cache_to_text(&out, &c->btree_key_cache); -+ return out.pos - buf; -+ } ++ if (attr == &sysfs_open_buckets) ++ bch2_open_buckets_to_text(out, c); + -+ if (attr == &sysfs_btree_transactions) { -+ bch2_btree_trans_to_text(&out, c); -+ return out.pos - buf; -+ } ++ if (attr == &sysfs_compression_stats) ++ bch2_compression_stats_to_text(out, c); + -+ if (attr == &sysfs_stripes_heap) { -+ bch2_stripes_heap_to_text(&out, c); -+ return out.pos - buf; -+ } ++ if (attr == &sysfs_new_stripes) ++ bch2_new_stripes_to_text(out, c); + -+ if (attr == &sysfs_open_buckets) { -+ bch2_open_buckets_to_text(&out, c); -+ return out.pos - buf; -+ } ++ if (attr == &sysfs_io_timers_read) ++ bch2_io_timers_to_text(out, &c->io_clock[READ]); + -+ if (attr == &sysfs_compression_stats) { -+ bch2_compression_stats_to_text(&out, c); -+ return out.pos - buf; -+ } ++ if (attr == &sysfs_io_timers_write) ++ bch2_io_timers_to_text(out, &c->io_clock[WRITE]); + -+ if (attr == &sysfs_new_stripes) { -+ bch2_new_stripes_to_text(&out, c); -+ return out.pos - buf; -+ } -+ -+ if (attr == &sysfs_io_timers_read) { -+ bch2_io_timers_to_text(&out, &c->io_clock[READ]); -+ return out.pos - buf; -+ } -+ if (attr == &sysfs_io_timers_write) { -+ bch2_io_timers_to_text(&out, &c->io_clock[WRITE]); -+ return out.pos - buf; -+ } -+ -+ if (attr == &sysfs_data_op_data_progress) { -+ data_progress_to_text(&out, c); -+ return out.pos - buf; -+ } ++ if (attr == &sysfs_data_jobs) ++ data_progress_to_text(out, c); + + return 0; +} @@ -70479,9 +73739,6 @@ index 000000000000..864be8601868 +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); + -+ sysfs_strtoul(journal_write_delay_ms, c->journal.write_delay_ms); -+ sysfs_strtoul(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms); -+ + if (attr == 
&sysfs_btree_gc_periodic) { + ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic) + ?: (ssize_t) size; @@ -70518,8 +73775,16 @@ index 000000000000..864be8601868 + + /* Debugging: */ + -+ if (attr == &sysfs_trigger_journal_flush) -+ bch2_journal_meta(&c->journal); ++ if (!test_bit(BCH_FS_RW, &c->flags)) ++ return -EROFS; ++ ++ if (attr == &sysfs_prune_cache) { ++ struct shrink_control sc; ++ ++ sc.gfp_mask = GFP_KERNEL; ++ sc.nr_to_scan = strtoul_or_return(buf); ++ c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc); ++ } + + if (attr == &sysfs_trigger_gc) { + /* @@ -70534,14 +73799,6 @@ index 000000000000..864be8601868 +#endif + } + -+ if (attr == &sysfs_prune_cache) { -+ struct shrink_control sc; -+ -+ sc.gfp_mask = GFP_KERNEL; -+ sc.nr_to_scan = strtoul_or_return(buf); -+ c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc); -+ } -+ +#ifdef CONFIG_BCACHEFS_TESTS + if (attr == &sysfs_perf_test) { + char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; @@ -70568,14 +73825,9 @@ index 000000000000..864be8601868 + +struct attribute *bch2_fs_files[] = { + &sysfs_minor, -+ &sysfs_block_size, -+ &sysfs_btree_node_size, + &sysfs_btree_cache_size, + &sysfs_btree_avg_write_size, + -+ &sysfs_journal_write_delay_ms, -+ &sysfs_journal_reclaim_delay_ms, -+ + &sysfs_promote_whole_extents, + + &sysfs_compression_stats, @@ -70591,7 +73843,7 @@ index 000000000000..864be8601868 +SHOW(bch2_fs_internal) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, internal); -+ return bch2_fs_show(&c->kobj, attr, buf); ++ return bch2_fs_to_text(out, &c->kobj, attr); +} + +STORE(bch2_fs_internal) @@ -70602,25 +73854,26 @@ index 000000000000..864be8601868 +SYSFS_OPS(bch2_fs_internal); + +struct attribute *bch2_fs_internal_files[] = { -+ &sysfs_alloc_debug, + &sysfs_journal_debug, -+ &sysfs_journal_pins, + &sysfs_btree_updates, -+ &sysfs_dirty_btree_nodes, + &sysfs_btree_cache, + &sysfs_btree_key_cache, + &sysfs_btree_transactions, ++ &sysfs_new_stripes, + 
&sysfs_stripes_heap, + &sysfs_open_buckets, ++ &sysfs_io_timers_read, ++ &sysfs_io_timers_write, ++ ++ &sysfs_trigger_gc, ++ &sysfs_prune_cache, + + &sysfs_read_realloc_races, + &sysfs_extent_migrate_done, + &sysfs_extent_migrate_raced, ++ &sysfs_bucket_alloc_fail, + -+ &sysfs_trigger_journal_flush, -+ &sysfs_trigger_gc, + &sysfs_gc_gens_pos, -+ &sysfs_prune_cache, + + &sysfs_copy_gc_enabled, + &sysfs_copy_gc_wait, @@ -70629,12 +73882,7 @@ index 000000000000..864be8601868 + &sysfs_rebalance_work, + sysfs_pd_controller_files(rebalance), + -+ &sysfs_new_stripes, -+ -+ &sysfs_io_timers_read, -+ &sysfs_io_timers_write, -+ -+ &sysfs_data_op_data_progress, ++ &sysfs_data_jobs, + + &sysfs_internal_uuid, + NULL @@ -70644,47 +73892,49 @@ index 000000000000..864be8601868 + +SHOW(bch2_fs_opts_dir) +{ -+ struct printbuf out = _PBUF(buf, PAGE_SIZE); + struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); + const struct bch_option *opt = container_of(attr, struct bch_option, attr); + int id = opt - bch2_opt_table; + u64 v = bch2_opt_get_by_id(&c->opts, id); + -+ bch2_opt_to_text(&out, c, opt, v, OPT_SHOW_FULL_LIST); -+ pr_buf(&out, "\n"); ++ bch2_opt_to_text(out, c, c->disk_sb.sb, opt, v, OPT_SHOW_FULL_LIST); ++ pr_char(out, '\n'); + -+ return out.pos - buf; ++ return 0; +} + +STORE(bch2_fs_opts_dir) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); + const struct bch_option *opt = container_of(attr, struct bch_option, attr); -+ int ret, id = opt - bch2_opt_table; ++ int ret = size, id = opt - bch2_opt_table; + char *tmp; + u64 v; + -+ tmp = kstrdup(buf, GFP_KERNEL); -+ if (!tmp) -+ return -ENOMEM; ++ /* ++ * We don't need to take c->writes for correctness, but it eliminates an ++ * unsightly error message in the dmesg log when we're RO: ++ */ ++ if (unlikely(!percpu_ref_tryget(&c->writes))) ++ return -EROFS; + -+ ret = bch2_opt_parse(c, opt, strim(tmp), &v); ++ tmp = kstrdup(buf, GFP_KERNEL); ++ if (!tmp) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ 
ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL); + kfree(tmp); + + if (ret < 0) -+ return ret; ++ goto err; + + ret = bch2_opt_check_may_set(c, id, v); + if (ret < 0) -+ return ret; -+ -+ if (opt->set_sb != SET_NO_SB_OPT) { -+ mutex_lock(&c->sb_lock); -+ opt->set_sb(c->disk_sb.sb, v); -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ } ++ goto err; + ++ bch2_opt_set_sb(c, opt, v); + bch2_opt_set_by_id(&c->opts, id, v); + + if ((id == Opt_background_target || @@ -70692,8 +73942,9 @@ index 000000000000..864be8601868 + bch2_rebalance_add_work(c, S64_MAX); + rebalance_wakeup(c); + } -+ -+ return size; ++err: ++ percpu_ref_put(&c->writes); ++ return ret; +} +SYSFS_OPS(bch2_fs_opts_dir); + @@ -70707,7 +73958,7 @@ index 000000000000..864be8601868 + for (i = bch2_opt_table; + i < bch2_opt_table + bch2_opts_nr; + i++) { -+ if (!(i->mode & (OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME))) ++ if (!(i->flags & OPT_FS)) + continue; + + ret = sysfs_create_file(kobj, &i->attr); @@ -70723,13 +73974,10 @@ index 000000000000..864be8601868 +SHOW(bch2_fs_time_stats) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats); -+ struct printbuf out = _PBUF(buf, PAGE_SIZE); + +#define x(name) \ -+ if (attr == &sysfs_time_stat_##name) { \ -+ bch2_time_stats_to_text(&out, &c->times[BCH_TIME_##name]);\ -+ return out.pos - buf; \ -+ } ++ if (attr == &sysfs_time_stat_##name) \ ++ bch2_time_stats_to_text(out, &c->times[BCH_TIME_##name]); + BCH_TIME_STATS() +#undef x + @@ -70750,24 +73998,6 @@ index 000000000000..864be8601868 + NULL +}; + -+static void reserve_stats_to_text(struct printbuf *out, struct bch_dev *ca) -+{ -+ enum alloc_reserve i; -+ -+ spin_lock(&ca->fs->freelist_lock); -+ -+ pr_buf(out, "free_inc:\t%zu\t%zu\n", -+ fifo_used(&ca->free_inc), -+ ca->free_inc.size); -+ -+ for (i = 0; i < RESERVE_NR; i++) -+ pr_buf(out, "free[%u]:\t%zu\t%zu\n", i, -+ fifo_used(&ca->free[i]), -+ ca->free[i].size); -+ -+ spin_unlock(&ca->fs->freelist_lock); -+} -+ +static void 
dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; @@ -70777,7 +74007,7 @@ index 000000000000..864be8601868 + memset(nr, 0, sizeof(nr)); + + for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++) -+ nr[c->open_buckets[i].type]++; ++ nr[c->open_buckets[i].data_type]++; + + pr_buf(out, + "\t\t buckets\t sectors fragmented\n" @@ -70793,9 +74023,6 @@ index 000000000000..864be8601868 + "ec\t%16llu\n" + "available%15llu\n" + "\n" -+ "free_inc\t\t%zu/%zu\n" -+ "free[RESERVE_MOVINGGC]\t%zu/%zu\n" -+ "free[RESERVE_NONE]\t%zu/%zu\n" + "freelist_wait\t\t%s\n" + "open buckets allocated\t%u\n" + "open buckets this dev\t%u\n" @@ -70803,13 +74030,9 @@ index 000000000000..864be8601868 + "open_buckets_wait\t%s\n" + "open_buckets_btree\t%u\n" + "open_buckets_user\t%u\n" -+ "btree reserve cache\t%u\n" -+ "thread state:\t\t%s\n", ++ "btree reserve cache\t%u\n", + stats.buckets_ec, -+ __dev_buckets_available(ca, stats), -+ fifo_used(&ca->free_inc), ca->free_inc.size, -+ fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size, -+ fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size, ++ __dev_buckets_available(ca, stats, RESERVE_none), + c->freelist_wait.list.first ? "waiting" : "empty", + OPEN_BUCKETS_COUNT - c->open_buckets_nr_free, + ca->nr_open_buckets, @@ -70817,8 +74040,7 @@ index 000000000000..864be8601868 + c->open_buckets_wait.list.first ? 
"waiting" : "empty", + nr[BCH_DATA_btree], + nr[BCH_DATA_user], -+ c->btree_reserve_cache_nr, -+ bch2_allocator_states[ca->allocator_state]); ++ c->btree_reserve_cache_nr); +} + +static const char * const bch2_rw[] = { @@ -70845,12 +74067,10 @@ index 000000000000..864be8601868 +{ + struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); + struct bch_fs *c = ca->fs; -+ struct printbuf out = _PBUF(buf, PAGE_SIZE); + + sysfs_printf(uuid, "%pU\n", ca->uuid.b); + + sysfs_print(bucket_size, bucket_bytes(ca)); -+ sysfs_print(block_size, block_bytes(c)); + sysfs_print(first_bucket, ca->mi.first_bucket); + sysfs_print(nbuckets, ca->mi.nbuckets); + sysfs_print(durability, ca->mi.durability); @@ -70859,66 +74079,44 @@ index 000000000000..864be8601868 + if (attr == &sysfs_label) { + if (ca->mi.group) { + mutex_lock(&c->sb_lock); -+ bch2_disk_path_to_text(&out, &c->disk_sb, ++ bch2_disk_path_to_text(out, c->disk_sb.sb, + ca->mi.group - 1); + mutex_unlock(&c->sb_lock); + } + -+ pr_buf(&out, "\n"); -+ return out.pos - buf; ++ pr_char(out, '\n'); + } + + if (attr == &sysfs_has_data) { -+ bch2_flags_to_text(&out, bch2_data_types, ++ bch2_flags_to_text(out, bch2_data_types, + bch2_dev_has_data(c, ca)); -+ pr_buf(&out, "\n"); -+ return out.pos - buf; -+ } -+ -+ if (attr == &sysfs_cache_replacement_policy) { -+ bch2_string_opt_to_text(&out, -+ bch2_cache_replacement_policies, -+ ca->mi.replacement); -+ pr_buf(&out, "\n"); -+ return out.pos - buf; ++ pr_char(out, '\n'); + } + + if (attr == &sysfs_state_rw) { -+ bch2_string_opt_to_text(&out, bch2_member_states, ++ bch2_string_opt_to_text(out, bch2_member_states, + ca->mi.state); -+ pr_buf(&out, "\n"); -+ return out.pos - buf; ++ pr_char(out, '\n'); + } + -+ if (attr == &sysfs_iodone) { -+ dev_iodone_to_text(&out, ca); -+ return out.pos - buf; -+ } ++ if (attr == &sysfs_iodone) ++ dev_iodone_to_text(out, ca); + + sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ])); + sysfs_print(io_latency_write, 
atomic64_read(&ca->cur_latency[WRITE])); + -+ if (attr == &sysfs_io_latency_stats_read) { -+ bch2_time_stats_to_text(&out, &ca->io_latency[READ]); -+ return out.pos - buf; -+ } -+ if (attr == &sysfs_io_latency_stats_write) { -+ bch2_time_stats_to_text(&out, &ca->io_latency[WRITE]); -+ return out.pos - buf; -+ } ++ if (attr == &sysfs_io_latency_stats_read) ++ bch2_time_stats_to_text(out, &ca->io_latency[READ]); ++ ++ if (attr == &sysfs_io_latency_stats_write) ++ bch2_time_stats_to_text(out, &ca->io_latency[WRITE]); + + sysfs_printf(congested, "%u%%", + clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) + * 100 / CONGESTED_MAX); + -+ if (attr == &sysfs_reserve_stats) { -+ reserve_stats_to_text(&out, ca); -+ return out.pos - buf; -+ } -+ if (attr == &sysfs_alloc_debug) { -+ dev_alloc_debug_to_text(&out, ca); -+ return out.pos - buf; -+ } ++ if (attr == &sysfs_alloc_debug) ++ dev_alloc_debug_to_text(out, ca); + + return 0; +} @@ -70942,22 +74140,6 @@ index 000000000000..864be8601868 + mutex_unlock(&c->sb_lock); + } + -+ if (attr == &sysfs_cache_replacement_policy) { -+ ssize_t v = __sysfs_match_string(bch2_cache_replacement_policies, -1, buf); -+ -+ if (v < 0) -+ return v; -+ -+ mutex_lock(&c->sb_lock); -+ mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; -+ -+ if ((unsigned) v != BCH_MEMBER_REPLACEMENT(mi)) { -+ SET_BCH_MEMBER_REPLACEMENT(mi, v); -+ bch2_write_super(c); -+ } -+ mutex_unlock(&c->sb_lock); -+ } -+ + if (attr == &sysfs_label) { + char *tmp; + int ret; @@ -70972,9 +74154,6 @@ index 000000000000..864be8601868 + return ret; + } + -+ if (attr == &sysfs_wake_allocator) -+ bch2_wake_allocator(ca); -+ + return size; +} +SYSFS_OPS(bch2_dev); @@ -70982,14 +74161,12 @@ index 000000000000..864be8601868 +struct attribute *bch2_dev_files[] = { + &sysfs_uuid, + &sysfs_bucket_size, -+ &sysfs_block_size, + &sysfs_first_bucket, + &sysfs_nbuckets, + &sysfs_durability, + + /* settings: */ + &sysfs_discard, -+ &sysfs_cache_replacement_policy, + 
&sysfs_state_rw, + &sysfs_label, + @@ -71002,11 +74179,8 @@ index 000000000000..864be8601868 + &sysfs_io_latency_stats_write, + &sysfs_congested, + -+ &sysfs_reserve_stats, -+ + /* debug: */ + &sysfs_alloc_debug, -+ &sysfs_wake_allocator, + NULL +}; + @@ -71063,16 +74237,17 @@ index 000000000000..525fd05d91f7 +#endif /* _BCACHEFS_SYSFS_H_ */ diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c new file mode 100644 -index 000000000000..d5a74f4db64d +index 000000000000..4369bfc55a94 --- /dev/null +++ b/fs/bcachefs/tests.c -@@ -0,0 +1,871 @@ +@@ -0,0 +1,947 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifdef CONFIG_BCACHEFS_TESTS + +#include "bcachefs.h" +#include "btree_update.h" +#include "journal_reclaim.h" ++#include "subvolume.h" +#include "tests.h" + +#include "linux/kthread.h" @@ -71083,13 +74258,14 @@ index 000000000000..d5a74f4db64d + int ret; + + ret = bch2_btree_delete_range(c, BTREE_ID_extents, -+ POS(0, 0), POS(0, U64_MAX), ++ SPOS(0, 0, U32_MAX), SPOS_MAX, ++ 0, + NULL); + BUG_ON(ret); + + ret = bch2_btree_delete_range(c, BTREE_ID_xattrs, -+ POS(0, 0), POS(0, U64_MAX), -+ NULL); ++ SPOS(0, 0, U32_MAX), SPOS_MAX, ++ 0, NULL); + BUG_ON(ret); +} + @@ -71213,7 +74389,7 @@ index 000000000000..d5a74f4db64d + i = 0; + + for_each_btree_key(&trans, iter, BTREE_ID_xattrs, -+ POS_MIN, 0, k, ret) { ++ SPOS(0, 0, U32_MAX), 0, k, ret) { + if (k.k->p.inode) + break; + @@ -71269,7 +74445,7 @@ index 000000000000..d5a74f4db64d + i = 0; + + for_each_btree_key(&trans, iter, BTREE_ID_extents, -+ POS_MIN, 0, k, ret) { ++ SPOS(0, 0, U32_MAX), 0, k, ret) { + BUG_ON(bkey_start_offset(k.k) != i); + i = k.k->p.offset; + } @@ -71323,8 +74499,8 @@ index 000000000000..d5a74f4db64d + + i = 0; + -+ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, -+ 0, k, ret) { ++ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), 0, k, ret) { + if (k.k->p.inode) + break; + @@ -71339,7 +74515,8 @@ index 000000000000..d5a74f4db64d + + i = 0; + -+ 
for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, ++ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), + BTREE_ITER_SLOTS, k, ret) { + BUG_ON(k.k->p.offset != i); + BUG_ON(bkey_deleted(k.k) != (i & 1)); @@ -71388,8 +74565,8 @@ index 000000000000..d5a74f4db64d + + i = 0; + -+ for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, -+ 0, k, ret) { ++ for_each_btree_key(&trans, iter, BTREE_ID_extents, ++ SPOS(0, 0, U32_MAX), 0, k, ret) { + BUG_ON(bkey_start_offset(k.k) != i + 8); + BUG_ON(k.k->size != 8); + i += 16; @@ -71402,7 +74579,8 @@ index 000000000000..d5a74f4db64d + + i = 0; + -+ for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, ++ for_each_btree_key(&trans, iter, BTREE_ID_extents, ++ SPOS(0, 0, U32_MAX), + BTREE_ITER_SLOTS, k, ret) { + BUG_ON(bkey_deleted(k.k) != !(i % 16)); + @@ -71430,7 +74608,8 @@ index 000000000000..d5a74f4db64d + struct bkey_s_c k; + + bch2_trans_init(&trans, c, 0, 0); -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, POS_MIN, 0); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), 0); + + k = bch2_btree_iter_peek(&iter); + BUG_ON(k.k); @@ -71450,7 +74629,8 @@ index 000000000000..d5a74f4db64d + struct bkey_s_c k; + + bch2_trans_init(&trans, c, 0, 0); -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, POS_MIN, 0); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, ++ SPOS(0, 0, U32_MAX), 0); + + k = bch2_btree_iter_peek(&iter); + BUG_ON(k.k); @@ -71473,8 +74653,6 @@ index 000000000000..d5a74f4db64d + struct bkey_i_cookie k; + int ret; + -+ //pr_info("inserting %llu-%llu v %llu", start, end, test_version); -+ + bkey_cookie_init(&k.k_i); + k.k_i.k.p.offset = end; + k.k_i.k.p.snapshot = U32_MAX; @@ -71526,6 +74704,70 @@ index 000000000000..d5a74f4db64d + __test_extent_overwrite(c, 32, 64, 32, 128); +} + ++/* snapshot unit tests */ ++ ++/* Test skipping over keys in unrelated snapshots: */ ++static int test_snapshot_filter(struct bch_fs *c, u32 
snapid_lo, u32 snapid_hi) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_i_cookie cookie; ++ int ret; ++ ++ bkey_cookie_init(&cookie.k_i); ++ cookie.k.p.snapshot = snapid_hi; ++ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, ++ NULL, NULL, 0); ++ if (ret) ++ return ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, ++ SPOS(0, 0, snapid_lo), 0); ++ k = bch2_btree_iter_peek(&iter); ++ ++ BUG_ON(k.k->p.snapshot != U32_MAX); ++ ++ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int test_snapshots(struct bch_fs *c, u64 nr) ++{ ++ struct bkey_i_cookie cookie; ++ u32 snapids[2]; ++ u32 snapid_subvols[2] = { 1, 1 }; ++ int ret; ++ ++ bkey_cookie_init(&cookie.k_i); ++ cookie.k.p.snapshot = U32_MAX; ++ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, ++ NULL, NULL, 0); ++ if (ret) ++ return ret; ++ ++ ret = bch2_trans_do(c, NULL, NULL, 0, ++ bch2_snapshot_node_create(&trans, U32_MAX, ++ snapids, ++ snapid_subvols, ++ 2)); ++ if (ret) ++ return ret; ++ ++ if (snapids[0] > snapids[1]) ++ swap(snapids[0], snapids[1]); ++ ++ ret = test_snapshot_filter(c, snapids[0], snapids[1]); ++ if (ret) { ++ bch_err(c, "err %i from test_snapshot_filter", ret); ++ return ret; ++ } ++ ++ return 0; ++} ++ +/* perf tests */ + +static u64 test_rand(void) @@ -71610,10 +74852,11 @@ index 000000000000..d5a74f4db64d + u64 i; + + bch2_trans_init(&trans, c, 0, 0); -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, POS_MIN, 0); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), 0); + + for (i = 0; i < nr; i++) { -+ bch2_btree_iter_set_pos(&iter, POS(0, test_rand())); ++ bch2_btree_iter_set_pos(&iter, SPOS(0, test_rand(), U32_MAX)); + + k = bch2_btree_iter_peek(&iter); + ret = bkey_err(k); @@ -71636,7 +74879,7 @@ index 000000000000..d5a74f4db64d + struct bkey_s_c k; + int ret; + -+ 
bch2_btree_iter_set_pos(iter, POS(0, pos)); ++ bch2_btree_iter_set_pos(iter, SPOS(0, pos, U32_MAX)); + + k = bch2_btree_iter_peek(iter); + ret = bkey_err(k); @@ -71648,10 +74891,10 @@ index 000000000000..d5a74f4db64d + if (!(i & 3) && k.k) { + bkey_cookie_init(&cookie->k_i); + cookie->k.p = iter->pos; -+ bch2_trans_update(trans, iter, &cookie->k_i, 0); ++ ret = bch2_trans_update(trans, iter, &cookie->k_i, 0); + } + -+ return 0; ++ return ret; +} + +static int rand_mixed(struct bch_fs *c, u64 nr) @@ -71663,7 +74906,8 @@ index 000000000000..d5a74f4db64d + u64 i, rand; + + bch2_trans_init(&trans, c, 0, 0); -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, POS_MIN, 0); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), 0); + + for (i = 0; i < nr; i++) { + rand = test_rand(); @@ -71683,7 +74927,6 @@ index 000000000000..d5a74f4db64d +static int __do_delete(struct btree_trans *trans, struct bpos pos) +{ + struct btree_iter iter; -+ struct bkey_i delete; + struct bkey_s_c k; + int ret = 0; + @@ -71697,10 +74940,7 @@ index 000000000000..d5a74f4db64d + if (!k.k) + goto err; + -+ bkey_init(&delete.k); -+ delete.k.p = k.k->p; -+ -+ ret = bch2_trans_update(trans, &iter, &delete, 0); ++ ret = bch2_btree_delete_at(trans, &iter, 0); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; @@ -71715,7 +74955,7 @@ index 000000000000..d5a74f4db64d + bch2_trans_init(&trans, c, 0, 0); + + for (i = 0; i < nr; i++) { -+ struct bpos pos = POS(0, test_rand()); ++ struct bpos pos = SPOS(0, test_rand(), U32_MAX); + + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + __do_delete(&trans, pos)); @@ -71742,7 +74982,7 @@ index 000000000000..d5a74f4db64d + + bch2_trans_init(&trans, c, 0, 0); + -+ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, ++ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { + insert.k.p = iter.pos; + @@ -71772,7 +75012,8 @@ index 000000000000..d5a74f4db64d + + 
bch2_trans_init(&trans, c, 0, 0); + -+ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, 0, k, ret) ++ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), 0, k, ret) + ; + bch2_trans_iter_exit(&trans, &iter); + @@ -71789,7 +75030,8 @@ index 000000000000..d5a74f4db64d + + bch2_trans_init(&trans, c, 0, 0); + -+ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, ++ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), + BTREE_ITER_INTENT, k, ret) { + struct bkey_i_cookie u; + @@ -71814,8 +75056,8 @@ index 000000000000..d5a74f4db64d + int ret; + + ret = bch2_btree_delete_range(c, BTREE_ID_xattrs, -+ POS(0, 0), POS(0, U64_MAX), -+ NULL); ++ SPOS(0, 0, U32_MAX), SPOS_MAX, ++ 0, NULL); + if (ret) + bch_err(c, "error in seq_delete: %i", ret); + return ret; @@ -71853,8 +75095,10 @@ index 000000000000..d5a74f4db64d + } + + ret = j->fn(j->c, div64_u64(j->nr, j->nr_threads)); -+ if (ret) ++ if (ret) { ++ bch_err(j->c, "%ps: error %i", j->fn, ret); + j->ret = ret; ++ } + + if (atomic_dec_and_test(&j->done)) { + j->finish = sched_clock(); @@ -71868,7 +75112,9 @@ index 000000000000..d5a74f4db64d + u64 nr, unsigned nr_threads) +{ + struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads }; -+ char name_buf[20], nr_buf[20], per_sec_buf[20]; ++ char name_buf[20]; ++ struct printbuf nr_buf = PRINTBUF; ++ struct printbuf per_sec_buf = PRINTBUF; + unsigned i; + u64 time; + @@ -71907,6 +75153,8 @@ index 000000000000..d5a74f4db64d + perf_test(test_extent_overwrite_middle); + perf_test(test_extent_overwrite_all); + ++ perf_test(test_snapshots); ++ + if (!j.fn) { + pr_err("unknown test %s", testname); + return -EINVAL; @@ -71927,13 +75175,15 @@ index 000000000000..d5a74f4db64d + time = j.finish - j.start; + + scnprintf(name_buf, sizeof(name_buf), "%s:", testname); -+ bch2_hprint(&PBUF(nr_buf), nr); -+ bch2_hprint(&PBUF(per_sec_buf), div64_u64(nr * NSEC_PER_SEC, time)); ++ bch2_hprint(&nr_buf, nr); ++ 
bch2_hprint(&per_sec_buf, div64_u64(nr * NSEC_PER_SEC, time)); + printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n", -+ name_buf, nr_buf, nr_threads, ++ name_buf, nr_buf.buf, nr_threads, + div_u64(time, NSEC_PER_SEC), + div_u64(time * nr_threads, nr), -+ per_sec_buf); ++ per_sec_buf.buf); ++ printbuf_exit(&per_sec_buf); ++ printbuf_exit(&nr_buf); + return j.ret; +} + @@ -71979,10 +75229,10 @@ index 000000000000..59e8dfa3d245 +#include diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c new file mode 100644 -index 000000000000..52de7c49cacb +index 000000000000..37fc20413764 --- /dev/null +++ b/fs/bcachefs/util.c -@@ -0,0 +1,912 @@ +@@ -0,0 +1,984 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * random utiility code, for bcache but in theory not specific to bcache @@ -72084,6 +75334,71 @@ index 000000000000..52de7c49cacb +STRTO_H(strtoull, unsigned long long) +STRTO_H(strtou64, u64) + ++static int bch2_printbuf_realloc(struct printbuf *out, unsigned extra) ++{ ++ unsigned new_size; ++ char *buf; ++ ++ if (out->pos + extra + 1 < out->size) ++ return 0; ++ ++ new_size = roundup_pow_of_two(out->size + extra); ++ buf = krealloc(out->buf, new_size, !out->atomic ? GFP_KERNEL : GFP_ATOMIC); ++ ++ if (!buf) { ++ out->allocation_failure = true; ++ return -ENOMEM; ++ } ++ ++ out->buf = buf; ++ out->size = new_size; ++ return 0; ++} ++ ++void bch2_pr_buf(struct printbuf *out, const char *fmt, ...) ++{ ++ va_list args; ++ int len; ++ ++ do { ++ va_start(args, fmt); ++ len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args); ++ va_end(args); ++ } while (len + 1 >= printbuf_remaining(out) && ++ !bch2_printbuf_realloc(out, len + 1)); ++ ++ len = min_t(size_t, len, ++ printbuf_remaining(out) ? 
printbuf_remaining(out) - 1 : 0); ++ out->pos += len; ++} ++ ++void bch2_pr_tab_rjust(struct printbuf *buf) ++{ ++ BUG_ON(buf->tabstop > ARRAY_SIZE(buf->tabstops)); ++ ++ if (printbuf_linelen(buf) < buf->tabstops[buf->tabstop]) { ++ unsigned move = buf->pos - buf->last_field; ++ unsigned shift = buf->tabstops[buf->tabstop] - ++ printbuf_linelen(buf); ++ ++ bch2_printbuf_realloc(buf, shift); ++ ++ if (buf->last_field + shift + 1 < buf->size) { ++ move = min(move, buf->size - 1 - buf->last_field - shift); ++ ++ memmove(buf->buf + buf->last_field + shift, ++ buf->buf + buf->last_field, ++ move); ++ memset(buf->buf + buf->last_field, ' ', shift); ++ buf->pos += shift; ++ buf->buf[buf->pos] = 0; ++ } ++ } ++ ++ buf->last_field = buf->pos; ++ buf->tabstop++; ++} ++ +void bch2_hprint(struct printbuf *buf, s64 v) +{ + int u, t = 0; @@ -72099,10 +75414,25 @@ index 000000000000..52de7c49cacb + * 103 is magic: t is in the range [-1023, 1023] and we want + * to turn it into [-9, 9] + */ -+ if (u && v < 100 && v > -100) ++ if (u && t && v < 100 && v > -100) + pr_buf(buf, ".%i", t / 103); + if (u) -+ pr_buf(buf, "%c", si_units[u]); ++ pr_char(buf, si_units[u]); ++} ++ ++void bch2_pr_units(struct printbuf *out, s64 raw, s64 bytes) ++{ ++ switch (out->units) { ++ case PRINTBUF_UNITS_RAW: ++ pr_buf(out, "%llu", raw); ++ break; ++ case PRINTBUF_UNITS_BYTES: ++ pr_buf(out, "%llu", bytes); ++ break; ++ case PRINTBUF_UNITS_HUMAN_READABLE: ++ bch2_hprint(out, bytes); ++ break; ++ } +} + +void bch2_string_opt_to_text(struct printbuf *out, @@ -72121,9 +75451,6 @@ index 000000000000..52de7c49cacb + unsigned bit, nr = 0; + bool first = true; + -+ if (out->pos != out->end) -+ *out->pos = '\0'; -+ + while (list[nr]) + nr++; + @@ -72452,36 +75779,44 @@ index 000000000000..52de7c49cacb + pd->backpressure = 1; +} + -+size_t bch2_pd_controller_print_debug(struct bch_pd_controller *pd, char *buf) ++void bch2_pd_controller_debug_to_text(struct printbuf *out, struct bch_pd_controller *pd) +{ -+ /* 
2^64 - 1 is 20 digits, plus null byte */ -+ char rate[21]; -+ char actual[21]; -+ char target[21]; -+ char proportional[21]; -+ char derivative[21]; -+ char change[21]; -+ s64 next_io; ++ out->tabstops[0] = 20; + -+ bch2_hprint(&PBUF(rate), pd->rate.rate); -+ bch2_hprint(&PBUF(actual), pd->last_actual); -+ bch2_hprint(&PBUF(target), pd->last_target); -+ bch2_hprint(&PBUF(proportional), pd->last_proportional); -+ bch2_hprint(&PBUF(derivative), pd->last_derivative); -+ bch2_hprint(&PBUF(change), pd->last_change); ++ pr_buf(out, "rate:"); ++ pr_tab(out); ++ bch2_hprint(out, pd->rate.rate); ++ pr_newline(out); + -+ next_io = div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC); ++ pr_buf(out, "target:"); ++ pr_tab(out); ++ bch2_hprint(out, pd->last_target); ++ pr_newline(out); + -+ return sprintf(buf, -+ "rate:\t\t%s/sec\n" -+ "target:\t\t%s\n" -+ "actual:\t\t%s\n" -+ "proportional:\t%s\n" -+ "derivative:\t%s\n" -+ "change:\t\t%s/sec\n" -+ "next io:\t%llims\n", -+ rate, target, actual, proportional, -+ derivative, change, next_io); ++ pr_buf(out, "actual:"); ++ pr_tab(out); ++ bch2_hprint(out, pd->last_actual); ++ pr_newline(out); ++ ++ pr_buf(out, "proportional:"); ++ pr_tab(out); ++ bch2_hprint(out, pd->last_proportional); ++ pr_newline(out); ++ ++ pr_buf(out, "derivative:"); ++ pr_tab(out); ++ bch2_hprint(out, pd->last_derivative); ++ pr_newline(out); ++ ++ pr_buf(out, "change:"); ++ pr_tab(out); ++ bch2_hprint(out, pd->last_change); ++ pr_newline(out); ++ ++ pr_buf(out, "next io:"); ++ pr_tab(out); ++ pr_buf(out, "%llims", div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC)); ++ pr_newline(out); +} + +/* misc: */ @@ -72564,19 +75899,6 @@ index 000000000000..52de7c49cacb + } +} + -+void bch_scnmemcpy(struct printbuf *out, -+ const char *src, size_t len) -+{ -+ size_t n = printbuf_remaining(out); -+ -+ if (n) { -+ n = min(n - 1, len); -+ memcpy(out->pos, src, n); -+ out->pos += n; -+ *out->pos = '\0'; -+ } -+} -+ +#include "eytzinger.h" + +static int 
alignment_ok(const void *base, size_t align) @@ -72897,10 +76219,10 @@ index 000000000000..52de7c49cacb +} diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h new file mode 100644 -index 000000000000..80402b398442 +index 000000000000..888693703c75 --- /dev/null +++ b/fs/bcachefs/util.h -@@ -0,0 +1,749 @@ +@@ -0,0 +1,877 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_UTIL_H +#define _BCACHEFS_UTIL_H @@ -73113,9 +76435,11 @@ index 000000000000..80402b398442 + \ + BUG_ON(_i >= (h)->used); \ + (h)->used--; \ -+ heap_swap(h, _i, (h)->used, set_backpointer); \ -+ heap_sift_up(h, _i, cmp, set_backpointer); \ -+ heap_sift_down(h, _i, cmp, set_backpointer); \ ++ if ((_i) < (h)->used) { \ ++ heap_swap(h, _i, (h)->used, set_backpointer); \ ++ heap_sift_up(h, _i, cmp, set_backpointer); \ ++ heap_sift_down(h, _i, cmp, set_backpointer); \ ++ } \ +} while (0) + +#define heap_pop(h, d, cmp, set_backpointer) \ @@ -73138,31 +76462,157 @@ index 000000000000..80402b398442 +#define ANYSINT_MAX(t) \ + ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1) + -+struct printbuf { -+ char *pos; -+ char *end; ++enum printbuf_units { ++ PRINTBUF_UNITS_RAW, ++ PRINTBUF_UNITS_BYTES, ++ PRINTBUF_UNITS_HUMAN_READABLE, +}; + ++struct printbuf { ++ char *buf; ++ unsigned size; ++ unsigned pos; ++ unsigned last_newline; ++ unsigned last_field; ++ unsigned indent; ++ enum printbuf_units units:8; ++ u8 atomic; ++ bool allocation_failure:1; ++ u8 tabstop; ++ u8 tabstops[4]; ++}; ++ ++#define PRINTBUF ((struct printbuf) { NULL }) ++ ++static inline void printbuf_exit(struct printbuf *buf) ++{ ++ kfree(buf->buf); ++ buf->buf = ERR_PTR(-EINTR); /* poison value */ ++} ++ ++static inline void printbuf_reset(struct printbuf *buf) ++{ ++ buf->pos = 0; ++ buf->last_newline = 0; ++ buf->last_field = 0; ++ buf->indent = 0; ++ buf->tabstop = 0; ++} ++ +static inline size_t printbuf_remaining(struct printbuf *buf) +{ -+ return buf->end - buf->pos; ++ return buf->size - buf->pos; +} + -+#define 
_PBUF(_buf, _len) \ -+ ((struct printbuf) { \ -+ .pos = _buf, \ -+ .end = _buf + _len, \ -+ }) ++static inline size_t printbuf_linelen(struct printbuf *buf) ++{ ++ return buf->pos - buf->last_newline; ++} + -+#define PBUF(_buf) _PBUF(_buf, sizeof(_buf)) ++void bch2_pr_buf(struct printbuf *out, const char *fmt, ...) ++ __attribute__ ((format (printf, 2, 3))); + -+#define pr_buf(_out, ...) \ -+do { \ -+ (_out)->pos += scnprintf((_out)->pos, printbuf_remaining(_out), \ -+ __VA_ARGS__); \ -+} while (0) ++#define pr_buf(_out, ...) bch2_pr_buf(_out, __VA_ARGS__) + -+void bch_scnmemcpy(struct printbuf *, const char *, size_t); ++static inline void pr_char(struct printbuf *out, char c) ++{ ++ bch2_pr_buf(out, "%c", c); ++} ++ ++static inline void pr_indent_push(struct printbuf *buf, unsigned spaces) ++{ ++ buf->indent += spaces; ++ while (spaces--) ++ pr_char(buf, ' '); ++} ++ ++static inline void pr_indent_pop(struct printbuf *buf, unsigned spaces) ++{ ++ if (buf->last_newline + buf->indent == buf->pos) { ++ buf->pos -= spaces; ++ buf->buf[buf->pos] = 0; ++ } ++ buf->indent -= spaces; ++} ++ ++static inline void pr_newline(struct printbuf *buf) ++{ ++ unsigned i; ++ ++ pr_char(buf, '\n'); ++ ++ buf->last_newline = buf->pos; ++ ++ for (i = 0; i < buf->indent; i++) ++ pr_char(buf, ' '); ++ ++ buf->last_field = buf->pos; ++ buf->tabstop = 0; ++} ++ ++static inline void pr_tab(struct printbuf *buf) ++{ ++ BUG_ON(buf->tabstop > ARRAY_SIZE(buf->tabstops)); ++ ++ while (printbuf_remaining(buf) > 1 && ++ printbuf_linelen(buf) < buf->tabstops[buf->tabstop]) ++ pr_char(buf, ' '); ++ ++ buf->last_field = buf->pos; ++ buf->tabstop++; ++} ++ ++void bch2_pr_tab_rjust(struct printbuf *); ++ ++static inline void pr_tab_rjust(struct printbuf *buf) ++{ ++ bch2_pr_tab_rjust(buf); ++} ++ ++void bch2_pr_units(struct printbuf *, s64, s64); ++#define pr_units(...) 
bch2_pr_units(__VA_ARGS__) ++ ++static inline void pr_sectors(struct printbuf *out, u64 v) ++{ ++ bch2_pr_units(out, v, v << 9); ++} ++ ++#ifdef __KERNEL__ ++static inline void pr_time(struct printbuf *out, u64 time) ++{ ++ pr_buf(out, "%llu", time); ++} ++#else ++#include ++static inline void pr_time(struct printbuf *out, u64 _time) ++{ ++ char time_str[64]; ++ time_t time = _time; ++ struct tm *tm = localtime(&time); ++ size_t err = strftime(time_str, sizeof(time_str), "%c", tm); ++ if (!err) ++ pr_buf(out, "(formatting error)"); ++ else ++ pr_buf(out, "%s", time_str); ++} ++#endif ++ ++#ifdef __KERNEL__ ++static inline void uuid_unparse_lower(u8 *uuid, char *out) ++{ ++ sprintf(out, "%pUb", uuid); ++} ++#else ++#include ++#endif ++ ++static inline void pr_uuid(struct printbuf *out, u8 *uuid) ++{ ++ char uuid_str[40]; ++ ++ uuid_unparse_lower(uuid, uuid_str); ++ pr_buf(out, uuid_str); ++} + +int bch2_strtoint_h(const char *, int *); +int bch2_strtouint_h(const char *, unsigned int *); @@ -73226,8 +76676,8 @@ index 000000000000..80402b398442 + _r; \ +}) + -+#define snprint(buf, size, var) \ -+ snprintf(buf, size, \ ++#define snprint(out, var) \ ++ pr_buf(out, \ + type_is(var, int) ? "%i\n" \ + : type_is(var, unsigned) ? "%u\n" \ + : type_is(var, long) ? 
"%li\n" \ @@ -73344,7 +76794,7 @@ index 000000000000..80402b398442 + +void bch2_pd_controller_update(struct bch_pd_controller *, s64, s64, int); +void bch2_pd_controller_init(struct bch_pd_controller *); -+size_t bch2_pd_controller_print_debug(struct bch_pd_controller *, char *); ++void bch2_pd_controller_debug_to_text(struct printbuf *, struct bch_pd_controller *); + +#define sysfs_pd_controller_attribute(name) \ + rw_attribute(name##_rate); \ @@ -73368,7 +76818,7 @@ index 000000000000..80402b398442 + sysfs_print(name##_rate_p_term_inverse, (var)->p_term_inverse); \ + \ + if (attr == &sysfs_##name##_rate_debug) \ -+ return bch2_pd_controller_print_debug(var, buf); \ ++ bch2_pd_controller_debug_to_text(out, var); \ +} while (0) + +#define sysfs_pd_controller_store(name, var) \ @@ -73795,7 +77245,7 @@ index 000000000000..92a182fb3d7a +#endif /* _BCACHEFS_VARINT_H */ diff --git a/fs/bcachefs/vstructs.h b/fs/bcachefs/vstructs.h new file mode 100644 -index 000000000000..c099cdc0605f +index 000000000000..53a694d71967 --- /dev/null +++ b/fs/bcachefs/vstructs.h @@ -0,0 +1,63 @@ @@ -73821,7 +77271,7 @@ index 000000000000..c099cdc0605f +({ \ + BUILD_BUG_ON(offsetof(_type, _data) % sizeof(u64)); \ + \ -+ (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \ ++ (size_t) (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \ +}) + +#define vstruct_bytes(_s) \ @@ -73864,7 +77314,7 @@ index 000000000000..c099cdc0605f +#endif /* _VSTRUCTS_H */ diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c new file mode 100644 -index 000000000000..464ed68318e7 +index 000000000000..8d23b4c2449e --- /dev/null +++ b/fs/bcachefs/xattr.c @@ -0,0 +1,629 @@ @@ -73981,11 +77431,11 @@ index 000000000000..464ed68318e7 + else + pr_buf(out, "(unknown type %u)", xattr.v->x_type); + -+ bch_scnmemcpy(out, xattr.v->x_name, -+ xattr.v->x_name_len); -+ pr_buf(out, ":"); -+ bch_scnmemcpy(out, xattr_val(xattr.v), -+ le16_to_cpu(xattr.v->x_val_len)); ++ pr_buf(out, "%.*s:%.*s", ++ xattr.v->x_name_len, ++ 
xattr.v->x_name, ++ le16_to_cpu(xattr.v->x_val_len), ++ (char *) xattr_val(xattr.v)); +} + +static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info *inode, @@ -74181,13 +77631,9 @@ index 000000000000..464ed68318e7 + if (ret) + goto err; + -+ for_each_btree_key_norestart(&trans, iter, BTREE_ID_xattrs, -+ SPOS(inum, offset, snapshot), 0, k, ret) { -+ BUG_ON(k.k->p.inode < inum); -+ -+ if (k.k->p.inode > inum) -+ break; -+ ++ for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_xattrs, ++ SPOS(inum, offset, snapshot), ++ POS(inum, U64_MAX), 0, k, ret) { + if (k.k->type != KEY_TYPE_xattr) + continue; + @@ -74296,9 +77742,8 @@ index 000000000000..464ed68318e7 + bch2_inode_opts_to_opts(bch2_inode_opts_get(&inode->ei_inode)); + const struct bch_option *opt; + int id, inode_opt_id; -+ char buf[512]; -+ struct printbuf out = PBUF(buf); -+ unsigned val_len; ++ struct printbuf out = PRINTBUF; ++ int ret; + u64 v; + + id = bch2_opt_lookup(name); @@ -74319,16 +77764,21 @@ index 000000000000..464ed68318e7 + return -ENODATA; + + v = bch2_opt_get_by_id(&opts, id); -+ bch2_opt_to_text(&out, c, opt, v, 0); ++ bch2_opt_to_text(&out, c, c->disk_sb.sb, opt, v, 0); + -+ val_len = out.pos - buf; ++ ret = out.pos; + -+ if (buffer && val_len > size) -+ return -ERANGE; ++ if (out.allocation_failure) { ++ ret = -ENOMEM; ++ } else if (buffer) { ++ if (out.pos > size) ++ ret = -ERANGE; ++ else ++ memcpy(buffer, out.buf, out.pos); ++ } + -+ if (buffer) -+ memcpy(buffer, buf, val_len); -+ return val_len; ++ printbuf_exit(&out); ++ return ret; +} + +static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler, @@ -74395,7 +77845,7 @@ index 000000000000..464ed68318e7 + memcpy(buf, value, size); + buf[size] = '\0'; + -+ ret = bch2_opt_parse(c, opt, buf, &v); ++ ret = bch2_opt_parse(c, opt, buf, &v, NULL); + kfree(buf); + + if (ret < 0) @@ -74583,7 +78033,7 @@ index cf871a81f4fd..30910dae37ad 100644 } EXPORT_SYMBOL(d_tmpfile); diff --git a/fs/inode.c 
b/fs/inode.c -index ed0cab8a32db..900927eab51c 100644 +index 8279c700a2b7..f6aa9ec4382b 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -56,8 +56,23 @@ @@ -74612,7 +78062,7 @@ index ed0cab8a32db..900927eab51c 100644 /* * Empty aops. Can be used for the cases where the user does not -@@ -394,7 +409,7 @@ EXPORT_SYMBOL(address_space_init_once); +@@ -393,7 +408,7 @@ EXPORT_SYMBOL(address_space_init_once); void inode_init_once(struct inode *inode) { memset(inode, 0, sizeof(*inode)); @@ -74621,7 +78071,7 @@ index ed0cab8a32db..900927eab51c 100644 INIT_LIST_HEAD(&inode->i_devices); INIT_LIST_HEAD(&inode->i_io_list); INIT_LIST_HEAD(&inode->i_wb_list); -@@ -478,14 +493,15 @@ static inline void inode_sb_list_del(struct inode *inode) +@@ -477,14 +492,15 @@ static inline void inode_sb_list_del(struct inode *inode) } } @@ -74644,7 +78094,7 @@ index ed0cab8a32db..900927eab51c 100644 } /** -@@ -498,13 +514,13 @@ static unsigned long hash(struct super_block *sb, unsigned long hashval) +@@ -497,13 +513,13 @@ static unsigned long hash(struct super_block *sb, unsigned long hashval) */ void __insert_inode_hash(struct inode *inode, unsigned long hashval) { @@ -74662,7 +78112,7 @@ index ed0cab8a32db..900927eab51c 100644 } EXPORT_SYMBOL(__insert_inode_hash); -@@ -516,11 +532,44 @@ EXPORT_SYMBOL(__insert_inode_hash); +@@ -515,11 +531,44 @@ EXPORT_SYMBOL(__insert_inode_hash); */ void __remove_inode_hash(struct inode *inode) { @@ -74712,7 +78162,7 @@ index ed0cab8a32db..900927eab51c 100644 } EXPORT_SYMBOL(__remove_inode_hash); -@@ -817,26 +866,28 @@ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc) +@@ -816,26 +865,28 @@ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc) return freed; } @@ -74745,7 +78195,7 @@ index ed0cab8a32db..900927eab51c 100644 goto repeat; } if (unlikely(inode->i_state & I_CREATING)) { -@@ -855,19 +906,20 @@ static struct inode *find_inode(struct super_block *sb, +@@ -854,19 +905,20 @@ static struct inode *find_inode(struct 
super_block *sb, * iget_locked for details. */ static struct inode *find_inode_fast(struct super_block *sb, @@ -74769,7 +78219,7 @@ index ed0cab8a32db..900927eab51c 100644 goto repeat; } if (unlikely(inode->i_state & I_CREATING)) { -@@ -1076,26 +1128,26 @@ EXPORT_SYMBOL(unlock_two_nondirectories); +@@ -1075,26 +1127,26 @@ EXPORT_SYMBOL(unlock_two_nondirectories); * return it locked, hashed, and with the I_NEW flag set. The file system gets * to fill it in before unlocking it via unlock_new_inode(). * @@ -74802,7 +78252,7 @@ index ed0cab8a32db..900927eab51c 100644 if (IS_ERR(old)) return NULL; wait_on_inode(old); -@@ -1117,12 +1169,12 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, +@@ -1116,12 +1168,12 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, */ spin_lock(&inode->i_lock); inode->i_state |= I_NEW; @@ -74817,7 +78267,7 @@ index ed0cab8a32db..900927eab51c 100644 return inode; } -@@ -1183,12 +1235,12 @@ EXPORT_SYMBOL(iget5_locked); +@@ -1182,12 +1234,12 @@ EXPORT_SYMBOL(iget5_locked); */ struct inode *iget_locked(struct super_block *sb, unsigned long ino) { @@ -74834,7 +78284,7 @@ index ed0cab8a32db..900927eab51c 100644 if (inode) { if (IS_ERR(inode)) return NULL; -@@ -1204,17 +1256,17 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) +@@ -1203,17 +1255,17 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) if (inode) { struct inode *old; @@ -74856,7 +78306,7 @@ index ed0cab8a32db..900927eab51c 100644 /* Return the locked inode with I_NEW set, the * caller is responsible for filling in the contents -@@ -1227,7 +1279,7 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) +@@ -1226,7 +1278,7 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) * us. Use the old inode instead of the one we just * allocated. 
*/ @@ -74865,7 +78315,7 @@ index ed0cab8a32db..900927eab51c 100644 destroy_inode(inode); if (IS_ERR(old)) return NULL; -@@ -1251,10 +1303,11 @@ EXPORT_SYMBOL(iget_locked); +@@ -1250,10 +1302,11 @@ EXPORT_SYMBOL(iget_locked); */ static int test_inode_iunique(struct super_block *sb, unsigned long ino) { @@ -74879,7 +78329,7 @@ index ed0cab8a32db..900927eab51c 100644 if (inode->i_ino == ino && inode->i_sb == sb) return 0; } -@@ -1338,12 +1391,12 @@ EXPORT_SYMBOL(igrab); +@@ -1337,12 +1390,12 @@ EXPORT_SYMBOL(igrab); struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { @@ -74896,7 +78346,7 @@ index ed0cab8a32db..900927eab51c 100644 return IS_ERR(inode) ? NULL : inode; } -@@ -1393,12 +1446,12 @@ EXPORT_SYMBOL(ilookup5); +@@ -1392,12 +1445,12 @@ EXPORT_SYMBOL(ilookup5); */ struct inode *ilookup(struct super_block *sb, unsigned long ino) { @@ -74913,7 +78363,7 @@ index ed0cab8a32db..900927eab51c 100644 if (inode) { if (IS_ERR(inode)) -@@ -1442,12 +1495,13 @@ struct inode *find_inode_nowait(struct super_block *sb, +@@ -1441,12 +1494,13 @@ struct inode *find_inode_nowait(struct super_block *sb, void *), void *data) { @@ -74930,7 +78380,7 @@ index ed0cab8a32db..900927eab51c 100644 if (inode->i_sb != sb) continue; mval = match(inode, hashval, data); -@@ -1458,7 +1512,7 @@ struct inode *find_inode_nowait(struct super_block *sb, +@@ -1457,7 +1511,7 @@ struct inode *find_inode_nowait(struct super_block *sb, goto out; } out: @@ -74939,7 +78389,7 @@ index ed0cab8a32db..900927eab51c 100644 return ret_inode; } EXPORT_SYMBOL(find_inode_nowait); -@@ -1487,13 +1541,14 @@ EXPORT_SYMBOL(find_inode_nowait); +@@ -1486,13 +1540,14 @@ EXPORT_SYMBOL(find_inode_nowait); struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { @@ -74956,7 +78406,7 @@ index ed0cab8a32db..900927eab51c 100644 if (inode->i_sb == sb && !(READ_ONCE(inode->i_state) & 
(I_FREEING | I_WILL_FREE)) && test(inode, data)) -@@ -1525,13 +1580,14 @@ EXPORT_SYMBOL(find_inode_rcu); +@@ -1524,13 +1579,14 @@ EXPORT_SYMBOL(find_inode_rcu); struct inode *find_inode_by_ino_rcu(struct super_block *sb, unsigned long ino) { @@ -74973,7 +78423,7 @@ index ed0cab8a32db..900927eab51c 100644 if (inode->i_ino == ino && inode->i_sb == sb && !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE))) -@@ -1545,39 +1601,42 @@ int insert_inode_locked(struct inode *inode) +@@ -1544,39 +1600,42 @@ int insert_inode_locked(struct inode *inode) { struct super_block *sb = inode->i_sb; ino_t ino = inode->i_ino; @@ -75090,10 +78540,10 @@ index 00952e92eae1..ae18dabd3fe7 100644 extern const char *bio_devname(struct bio *bio, char *buffer); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h -index 12b9dbcc980e..2f5c517209b1 100644 +index 67344dfe07a7..0447a4213315 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h -@@ -891,6 +891,7 @@ extern const char *blk_op_str(unsigned int op); +@@ -897,6 +897,7 @@ extern const char *blk_op_str(unsigned int op); int blk_status_to_errno(blk_status_t status); blk_status_t errno_to_blk_status(int errno); @@ -75213,10 +78663,10 @@ index c88cdc4ae4ec..36b4a83f9b77 100644 + #endif /* _LINUX_CLOSURE_H */ diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h -index e6ec63403965..4e9c50282193 100644 +index 3de06a8fae73..88142e91593a 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h -@@ -305,4 +305,9 @@ +@@ -315,4 +315,9 @@ */ #define __weak __attribute__((__weak__)) @@ -75256,10 +78706,10 @@ index 3260fe714846..bac82bd72626 100644 * 128 bit child FID (struct lu_fid) * 128 bit parent FID (struct lu_fid) diff --git a/include/linux/fs.h b/include/linux/fs.h -index e7a633353fd2..08f3c4ab0828 100644 +index fd4c450dc612..19c3c0d8c718 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h -@@ -675,7 +675,8 @@ struct inode { +@@ -676,7 +676,8 @@ 
struct inode { unsigned long dirtied_when; /* jiffies of first dirtying */ unsigned long dirtied_time_when; @@ -75269,7 +78719,7 @@ index e7a633353fd2..08f3c4ab0828 100644 struct list_head i_io_list; /* backing dev IO list */ #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *i_wb; /* the associated cgroup wb */ -@@ -741,7 +742,7 @@ static inline unsigned int i_blocksize(const struct inode *node) +@@ -742,7 +743,7 @@ static inline unsigned int i_blocksize(const struct inode *node) static inline int inode_unhashed(struct inode *inode) { @@ -75278,7 +78728,7 @@ index e7a633353fd2..08f3c4ab0828 100644 } /* -@@ -752,7 +753,7 @@ static inline int inode_unhashed(struct inode *inode) +@@ -753,7 +754,7 @@ static inline int inode_unhashed(struct inode *inode) */ static inline void inode_fake_hash(struct inode *inode) { @@ -75287,7 +78737,7 @@ index e7a633353fd2..08f3c4ab0828 100644 } /* -@@ -3187,7 +3188,7 @@ static inline void insert_inode_hash(struct inode *inode) +@@ -3129,7 +3130,7 @@ static inline void insert_inode_hash(struct inode *inode) extern void __remove_inode_hash(struct inode *); static inline void remove_inode_hash(struct inode *inode) { @@ -75347,10 +78797,10 @@ index ae1b541446c9..8ee2bf5af131 100644 { bit_spin_lock(0, (unsigned long *)b); diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h -index 9fe165beb0f9..1113f6ed0eb4 100644 +index aa0ecfc6cdb4..65b6d7da4345 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h -@@ -336,6 +336,8 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); +@@ -340,6 +340,8 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); #define lockdep_repin_lock(l,c) lock_repin_lock(&(l)->dep_map, (c)) #define lockdep_unpin_lock(l,c) lock_unpin_lock(&(l)->dep_map, (c)) @@ -75359,7 +78809,7 @@ index 9fe165beb0f9..1113f6ed0eb4 100644 #else /* !CONFIG_LOCKDEP */ static inline void lockdep_init_task(struct task_struct *task) -@@ -423,6 +425,8 @@ extern int 
lockdep_is_held(const void *); +@@ -427,6 +429,8 @@ extern int lockdep_is_held(const void *); #define lockdep_repin_lock(l, c) do { (void)(l); (void)(c); } while (0) #define lockdep_unpin_lock(l, c) do { (void)(l); (void)(c); } while (0) @@ -75369,7 +78819,7 @@ index 9fe165beb0f9..1113f6ed0eb4 100644 enum xhlock_context_t { diff --git a/include/linux/sched.h b/include/linux/sched.h -index c1a927ddec64..10a353e3313f 100644 +index e418935f8db6..6c7427c7d547 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -856,6 +856,7 @@ struct task_struct { @@ -75590,10 +79040,10 @@ index 000000000000..477c33eb00d7 + +#endif /* _LINUX_SIX_H */ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h -index 671d402c3778..94417aac15c7 100644 +index 5535be1012a2..8ffb67b9e118 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h -@@ -141,6 +141,7 @@ extern void *vzalloc(unsigned long size); +@@ -148,6 +148,7 @@ extern void *vzalloc(unsigned long size); extern void *vmalloc_user(unsigned long size); extern void *vmalloc_node(unsigned long size, int node); extern void *vzalloc_node(unsigned long size, int node); @@ -75603,10 +79053,10 @@ index 671d402c3778..94417aac15c7 100644 extern void *__vmalloc(unsigned long size, gfp_t gfp_mask); diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h new file mode 100644 -index 000000000000..fce3146378f9 +index 000000000000..f63a7c87265d --- /dev/null +++ b/include/trace/events/bcachefs.h -@@ -0,0 +1,800 @@ +@@ -0,0 +1,1034 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM bcachefs @@ -75791,6 +79241,40 @@ index 000000000000..fce3146378f9 + __entry->nr_flushed) +); + ++/* allocator: */ ++ ++TRACE_EVENT(do_discards, ++ TP_PROTO(struct bch_fs *c, u64 seen, u64 open, ++ u64 need_journal_commit, u64 discarded, int ret), ++ TP_ARGS(c, seen, open, need_journal_commit, discarded, ret), ++ ++ TP_STRUCT__entry( ++ __field(dev_t, dev ) ++ __field(u64, seen ) ++ 
__field(u64, open ) ++ __field(u64, need_journal_commit ) ++ __field(u64, discarded ) ++ __field(int, ret ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = c->dev; ++ __entry->seen = seen; ++ __entry->open = open; ++ __entry->need_journal_commit = need_journal_commit; ++ __entry->discarded = discarded; ++ __entry->ret = ret; ++ ), ++ ++ TP_printk("%d%d seen %llu open %llu need_journal_commit %llu discarded %llu ret %i", ++ MAJOR(__entry->dev), MINOR(__entry->dev), ++ __entry->seen, ++ __entry->open, ++ __entry->need_journal_commit, ++ __entry->discarded, ++ __entry->ret) ++); ++ +/* bset.c: */ + +DEFINE_EVENT(bpos, bkey_pack_pos_fail, @@ -75927,6 +79411,80 @@ index 000000000000..fce3146378f9 + TP_ARGS(c, b) +); + ++TRACE_EVENT(btree_cache_scan, ++ TP_PROTO(unsigned long nr_to_scan_pages, ++ unsigned long nr_to_scan_nodes, ++ unsigned long can_free_nodes, ++ long ret), ++ TP_ARGS(nr_to_scan_pages, nr_to_scan_nodes, can_free_nodes, ret), ++ ++ TP_STRUCT__entry( ++ __field(unsigned long, nr_to_scan_pages ) ++ __field(unsigned long, nr_to_scan_nodes ) ++ __field(unsigned long, can_free_nodes ) ++ __field(long, ret ) ++ ), ++ ++ TP_fast_assign( ++ __entry->nr_to_scan_pages = nr_to_scan_pages; ++ __entry->nr_to_scan_nodes = nr_to_scan_nodes; ++ __entry->can_free_nodes = can_free_nodes; ++ __entry->ret = ret; ++ ), ++ ++ TP_printk("scanned for %lu pages, %lu nodes, can free %lu nodes, ret %li", ++ __entry->nr_to_scan_pages, ++ __entry->nr_to_scan_nodes, ++ __entry->can_free_nodes, ++ __entry->ret) ++); ++ ++TRACE_EVENT(btree_node_relock_fail, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos, ++ unsigned long node, ++ u32 iter_lock_seq, ++ u32 node_lock_seq), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos, node, iter_lock_seq, node_lock_seq), ++ ++ TP_STRUCT__entry( ++ __array(char, trans_fn, 24 ) ++ __field(unsigned long, caller_ip ) ++ __field(u8, btree_id ) ++ __field(u64, pos_inode ) ++ __field(u64, pos_offset ) 
++ __field(u32, pos_snapshot ) ++ __field(unsigned long, node ) ++ __field(u32, iter_lock_seq ) ++ __field(u32, node_lock_seq ) ++ ), ++ ++ TP_fast_assign( ++ strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); ++ __entry->caller_ip = caller_ip; ++ __entry->btree_id = btree_id; ++ __entry->pos_inode = pos->inode; ++ __entry->pos_offset = pos->offset; ++ __entry->pos_snapshot = pos->snapshot; ++ __entry->node = node; ++ __entry->iter_lock_seq = iter_lock_seq; ++ __entry->node_lock_seq = node_lock_seq; ++ ), ++ ++ TP_printk("%s %pS btree %u pos %llu:%llu:%u, node %lu iter seq %u lock seq %u", ++ __entry->trans_fn, ++ (void *) __entry->caller_ip, ++ __entry->btree_id, ++ __entry->pos_inode, ++ __entry->pos_offset, ++ __entry->pos_snapshot, ++ __entry->node, ++ __entry->iter_lock_seq, ++ __entry->node_lock_seq) ++); ++ +/* Garbage collection */ + +DEFINE_EVENT(btree_node, btree_gc_rewrite_node, @@ -75968,7 +79526,7 @@ index 000000000000..fce3146378f9 + ), + + TP_fast_assign( -+ __entry->dev = ca->disk_sb.bdev->bd_dev; ++ __entry->dev = ca->dev; + __entry->found = found; + __entry->inc_gen = inc_gen; + __entry->inc_gen_skipped = inc_gen_skipped; @@ -75990,7 +79548,7 @@ index 000000000000..fce3146378f9 + ), + + TP_fast_assign( -+ __entry->dev = ca->disk_sb.bdev->bd_dev; ++ __entry->dev = ca->dev; + __entry->offset = offset, + __entry->sectors = sectors; + ), @@ -76003,37 +79561,79 @@ index 000000000000..fce3146378f9 +); + +DECLARE_EVENT_CLASS(bucket_alloc, -+ TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), -+ TP_ARGS(ca, reserve), ++ TP_PROTO(struct bch_dev *ca, const char *alloc_reserve), ++ TP_ARGS(ca, alloc_reserve), + + TP_STRUCT__entry( + __field(dev_t, dev ) -+ __field(enum alloc_reserve, reserve ) ++ __array(char, reserve, 16 ) + ), + + TP_fast_assign( -+ __entry->dev = ca->disk_sb.bdev->bd_dev; -+ __entry->reserve = reserve; ++ __entry->dev = ca->dev; ++ strlcpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); + ), + -+ 
TP_printk("%d,%d reserve %d", ++ TP_printk("%d,%d reserve %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->reserve) +); + +DEFINE_EVENT(bucket_alloc, bucket_alloc, -+ TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), -+ TP_ARGS(ca, reserve) ++ TP_PROTO(struct bch_dev *ca, const char *alloc_reserve), ++ TP_ARGS(ca, alloc_reserve) +); + -+DEFINE_EVENT(bucket_alloc, bucket_alloc_fail, -+ TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), -+ TP_ARGS(ca, reserve) ++TRACE_EVENT(bucket_alloc_fail, ++ TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, ++ u64 avail, ++ u64 seen, ++ u64 open, ++ u64 need_journal_commit, ++ u64 nouse, ++ bool nonblocking, ++ int ret), ++ TP_ARGS(ca, alloc_reserve, avail, seen, open, need_journal_commit, nouse, nonblocking, ret), ++ ++ TP_STRUCT__entry( ++ __field(dev_t, dev ) ++ __array(char, reserve, 16 ) ++ __field(u64, avail ) ++ __field(u64, seen ) ++ __field(u64, open ) ++ __field(u64, need_journal_commit ) ++ __field(u64, nouse ) ++ __field(bool, nonblocking ) ++ __field(int, ret ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = ca->dev; ++ strlcpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); ++ __entry->avail = avail; ++ __entry->seen = seen; ++ __entry->open = open; ++ __entry->need_journal_commit = need_journal_commit; ++ __entry->nouse = nouse; ++ __entry->nonblocking = nonblocking; ++ __entry->ret = ret; ++ ), ++ ++ TP_printk("%d,%d reserve %s avail %llu seen %llu open %llu need_journal_commit %llu nouse %llu nonblocking %u ret %i", ++ MAJOR(__entry->dev), MINOR(__entry->dev), ++ __entry->reserve, ++ __entry->avail, ++ __entry->seen, ++ __entry->open, ++ __entry->need_journal_commit, ++ __entry->nouse, ++ __entry->nonblocking, ++ __entry->ret) +); + +DEFINE_EVENT(bucket_alloc, open_bucket_alloc_fail, -+ TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), -+ TP_ARGS(ca, reserve) ++ TP_PROTO(struct bch_dev *ca, const char *alloc_reserve), ++ TP_ARGS(ca, alloc_reserve) +); + +/* Moving 
IO */ @@ -76127,94 +79727,87 @@ index 000000000000..fce3146378f9 + __entry->wait_amount, __entry->until) +); + -+TRACE_EVENT(transaction_restart_ip, -+ TP_PROTO(unsigned long caller, unsigned long ip), -+ TP_ARGS(caller, ip), -+ -+ TP_STRUCT__entry( -+ __field(unsigned long, caller ) -+ __field(unsigned long, ip ) -+ ), -+ -+ TP_fast_assign( -+ __entry->caller = caller; -+ __entry->ip = ip; -+ ), -+ -+ TP_printk("%ps %pS", (void *) __entry->caller, (void *) __entry->ip) -+); -+ +DECLARE_EVENT_CLASS(transaction_restart, -+ TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip), -+ TP_ARGS(trans_ip, caller_ip), ++ TP_ARGS(trans_fn, caller_ip), + + TP_STRUCT__entry( -+ __field(unsigned long, trans_ip ) ++ __array(char, trans_fn, 24 ) + __field(unsigned long, caller_ip ) + ), + + TP_fast_assign( -+ __entry->trans_ip = trans_ip; ++ strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + ), + -+ TP_printk("%ps %pS", -+ (void *) __entry->trans_ip, -+ (void *) __entry->caller_ip) ++ TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip) ++); ++ ++DEFINE_EVENT(transaction_restart, transaction_restart_ip, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip), ++ TP_ARGS(trans_fn, caller_ip) +); + +DEFINE_EVENT(transaction_restart, trans_blocked_journal_reclaim, -+ TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip), -+ TP_ARGS(trans_ip, caller_ip) ++ TP_ARGS(trans_fn, caller_ip) +); + +DEFINE_EVENT(transaction_restart, trans_restart_journal_res_get, -+ TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip), -+ TP_ARGS(trans_ip, caller_ip) ++ TP_ARGS(trans_fn, caller_ip) +); + +DEFINE_EVENT(transaction_restart, trans_restart_journal_preres_get, -+ TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip), -+ TP_ARGS(trans_ip, caller_ip) ++ TP_ARGS(trans_fn, 
caller_ip) +); + +DEFINE_EVENT(transaction_restart, trans_restart_journal_reclaim, -+ TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip), -+ TP_ARGS(trans_ip, caller_ip) ++ TP_ARGS(trans_fn, caller_ip) +); + +DEFINE_EVENT(transaction_restart, trans_restart_fault_inject, -+ TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip), -+ TP_ARGS(trans_ip, caller_ip) ++ TP_ARGS(trans_fn, caller_ip) +); + +DEFINE_EVENT(transaction_restart, trans_traverse_all, -+ TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip), -+ TP_ARGS(trans_ip, caller_ip) ++ TP_ARGS(trans_fn, caller_ip) +); + +DEFINE_EVENT(transaction_restart, trans_restart_mark_replicas, -+ TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip), -+ TP_ARGS(trans_ip, caller_ip) ++ TP_ARGS(trans_fn, caller_ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_key_cache_raced, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip), ++ TP_ARGS(trans_fn, caller_ip) +); + +DECLARE_EVENT_CLASS(transaction_restart_iter, -+ TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos), -+ TP_ARGS(trans_ip, caller_ip, btree_id, pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos), + + TP_STRUCT__entry( -+ __field(unsigned long, trans_ip ) ++ __array(char, trans_fn, 24 ) + __field(unsigned long, caller_ip ) + __field(u8, btree_id ) + __field(u64, pos_inode ) @@ -76223,7 +79816,7 @@ index 000000000000..fce3146378f9 + ), + + TP_fast_assign( -+ __entry->trans_ip = trans_ip; ++ strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->btree_id = btree_id; + __entry->pos_inode = pos->inode; @@ -76231,8 +79824,8 @@ index 000000000000..fce3146378f9 + __entry->pos_snapshot = pos->snapshot; + ), + -+ TP_printk("%ps %pS btree %u pos 
%llu:%llu:%u", -+ (void *) __entry->trans_ip, ++ TP_printk("%s %pS btree %u pos %llu:%llu:%u", ++ __entry->trans_fn, + (void *) __entry->caller_ip, + __entry->btree_id, + __entry->pos_inode, @@ -76241,63 +79834,111 @@ index 000000000000..fce3146378f9 +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_reused, -+ TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos), -+ TP_ARGS(trans_ip, caller_ip, btree_id, pos) ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split, -+ TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos), -+ TP_ARGS(trans_ip, caller_ip, btree_id, pos) -+); -+ -+DEFINE_EVENT(transaction_restart_iter, trans_restart_mark, -+ TP_PROTO(unsigned long trans_ip, -+ unsigned long caller_ip, -+ enum btree_id btree_id, -+ struct bpos *pos), -+ TP_ARGS(trans_ip, caller_ip, btree_id, pos) ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_upgrade, -+ TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos), -+ TP_ARGS(trans_ip, caller_ip, btree_id, pos) ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_iter_upgrade, -+ TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos), -+ TP_ARGS(trans_ip, caller_ip, btree_id, pos) ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_relock, -+ TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos), -+ TP_ARGS(trans_ip, caller_ip, btree_id, pos) ++ 
TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_next_node, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_parent_for_fill, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_after_fill, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_key_cache_fill, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path_intent, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_traverse, -+ TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos), -+ TP_ARGS(trans_ip, caller_ip, btree_id, pos) ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_memory_allocation_failure, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ 
TP_ARGS(trans_fn, caller_ip, btree_id, pos) +); + +TRACE_EVENT(trans_restart_would_deadlock, -+ TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + bool in_traverse_all, + unsigned reason, @@ -76307,12 +79948,12 @@ index 000000000000..fce3146378f9 + enum btree_id want_btree_id, + unsigned want_iter_type, + struct bpos *want_pos), -+ TP_ARGS(trans_ip, caller_ip, in_traverse_all, reason, ++ TP_ARGS(trans_fn, caller_ip, in_traverse_all, reason, + have_btree_id, have_iter_type, have_pos, + want_btree_id, want_iter_type, want_pos), + + TP_STRUCT__entry( -+ __field(unsigned long, trans_ip ) ++ __array(char, trans_fn, 24 ) + __field(unsigned long, caller_ip ) + __field(u8, in_traverse_all ) + __field(u8, reason ) @@ -76330,7 +79971,7 @@ index 000000000000..fce3146378f9 + ), + + TP_fast_assign( -+ __entry->trans_ip = trans_ip; ++ strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->in_traverse_all = in_traverse_all; + __entry->reason = reason; @@ -76348,8 +79989,8 @@ index 000000000000..fce3146378f9 + __entry->want_pos_snapshot = want_pos->snapshot; + ), + -+ TP_printk("%ps %pS traverse_all %u because %u have %u:%u %llu:%llu:%u want %u:%u %llu:%llu:%u", -+ (void *) __entry->trans_ip, ++ TP_printk("%s %pS traverse_all %u because %u have %u:%u %llu:%llu:%u want %u:%u %llu:%llu:%u", ++ __entry->trans_fn, + (void *) __entry->caller_ip, + __entry->in_traverse_all, + __entry->reason, @@ -76366,43 +80007,86 @@ index 000000000000..fce3146378f9 +); + +TRACE_EVENT(trans_restart_would_deadlock_write, -+ TP_PROTO(unsigned long trans_ip), -+ TP_ARGS(trans_ip), ++ TP_PROTO(const char *trans_fn), ++ TP_ARGS(trans_fn), + + TP_STRUCT__entry( -+ __field(unsigned long, trans_ip ) ++ __array(char, trans_fn, 24 ) + ), + + TP_fast_assign( -+ __entry->trans_ip = trans_ip; ++ strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + ), + -+ TP_printk("%ps", (void *) __entry->trans_ip) ++ 
TP_printk("%s", __entry->trans_fn) +); + +TRACE_EVENT(trans_restart_mem_realloced, -+ TP_PROTO(unsigned long trans_ip, unsigned long caller_ip, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, + unsigned long bytes), -+ TP_ARGS(trans_ip, caller_ip, bytes), ++ TP_ARGS(trans_fn, caller_ip, bytes), + + TP_STRUCT__entry( -+ __field(unsigned long, trans_ip ) ++ __array(char, trans_fn, 24 ) + __field(unsigned long, caller_ip ) + __field(unsigned long, bytes ) + ), + + TP_fast_assign( -+ __entry->trans_ip = trans_ip; ++ strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->bytes = bytes; + ), + -+ TP_printk("%ps %pS bytes %lu", -+ (void *) __entry->trans_ip, ++ TP_printk("%s %pS bytes %lu", ++ __entry->trans_fn, + (void *) __entry->caller_ip, + __entry->bytes) +); + ++TRACE_EVENT(trans_restart_key_cache_key_realloced, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos, ++ unsigned old_u64s, ++ unsigned new_u64s), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos, old_u64s, new_u64s), ++ ++ TP_STRUCT__entry( ++ __array(char, trans_fn, 24 ) ++ __field(unsigned long, caller_ip ) ++ __field(enum btree_id, btree_id ) ++ __field(u64, inode ) ++ __field(u64, offset ) ++ __field(u32, snapshot ) ++ __field(u32, old_u64s ) ++ __field(u32, new_u64s ) ++ ), ++ ++ TP_fast_assign( ++ strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); ++ __entry->caller_ip = caller_ip; ++ __entry->btree_id = btree_id; ++ __entry->inode = pos->inode; ++ __entry->offset = pos->offset; ++ __entry->snapshot = pos->snapshot; ++ __entry->old_u64s = old_u64s; ++ __entry->new_u64s = new_u64s; ++ ), ++ ++ TP_printk("%s %pS btree %s pos %llu:%llu:%u old_u64s %u new_u64s %u", ++ __entry->trans_fn, ++ (void *) __entry->caller_ip, ++ bch2_btree_ids[__entry->btree_id], ++ __entry->inode, ++ __entry->offset, ++ __entry->snapshot, ++ __entry->old_u64s, ++ __entry->new_u64s) ++); ++ 
+#endif /* _TRACE_BCACHE_H */ + +/* This part must be outside protection */ @@ -76440,10 +80124,10 @@ index d51cabf28f38..cadbf6520c4b 100644 obj-$(CONFIG_LOCK_EVENT_COUNTS) += lock_events.o +obj-$(CONFIG_SIXLOCKS) += six.o diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c -index bf1c00c881e4..22e7c15ebab4 100644 +index e6a282bc1665..641d3d50780f 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c -@@ -6438,6 +6438,26 @@ void debug_check_no_locks_held(void) +@@ -6443,6 +6443,26 @@ void debug_check_no_locks_held(void) } EXPORT_SYMBOL_GPL(debug_check_no_locks_held); @@ -77236,7 +80920,7 @@ index 000000000000..fca1208720b6 +} +EXPORT_SYMBOL_GPL(six_lock_pcpu_alloc); diff --git a/kernel/module.c b/kernel/module.c -index 5c26a76e800b..155f9c1536b0 100644 +index ef79f4dbda87..17af02711eda 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2835,9 +2835,7 @@ static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug) @@ -77251,10 +80935,10 @@ index 5c26a76e800b..155f9c1536b0 100644 bool __weak module_init_section(const char *name) diff --git a/lib/Kconfig b/lib/Kconfig -index 5e7165e6a346..32786f287f46 100644 +index baa977e003b7..aa1c7f286bad 100644 --- a/lib/Kconfig +++ b/lib/Kconfig -@@ -481,6 +481,9 @@ config ASSOCIATIVE_ARRAY +@@ -485,6 +485,9 @@ config ASSOCIATIVE_ARRAY for more information. 
@@ -77265,10 +80949,10 @@ index 5e7165e6a346..32786f287f46 100644 bool depends on !NO_IOMEM diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug -index 2a9b6dcdac4f..1c2f33b8a526 100644 +index 1699b2124558..5ce59387de9e 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug -@@ -1666,6 +1666,15 @@ config DEBUG_CREDENTIALS +@@ -1672,6 +1672,15 @@ config DEBUG_CREDENTIALS source "kernel/rcu/Kconfig.debug" @@ -77285,7 +80969,7 @@ index 2a9b6dcdac4f..1c2f33b8a526 100644 bool "Force round-robin CPU selection for unbound work items" depends on DEBUG_KERNEL diff --git a/lib/Makefile b/lib/Makefile -index a841be5244ac..9759073b2c27 100644 +index 0868cb67e5b0..f8b06256f03a 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -239,6 +239,8 @@ obj-$(CONFIG_ATOMIC64_SELFTEST) += atomic64_test.o @@ -77451,10 +81135,10 @@ index f25eb111c051..7dfa88282b00 100644 objs_per_page; if (i == GENRADIX_ARY) diff --git a/mm/filemap.c b/mm/filemap.c -index dae481293b5d..0dc615e48c8c 100644 +index dbc461703ff4..75a38c04c94f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c -@@ -2186,6 +2186,7 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, +@@ -2187,6 +2187,7 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, return ret; } @@ -77492,10 +81176,10 @@ index 02d2427b8f9e..2f4894f9b7e3 100644 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) * @size: allocation size diff --git a/mm/vmalloc.c b/mm/vmalloc.c -index e8a807c78110..5fb4311fdc07 100644 +index 8375eecc55de..9f7ac3851caa 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c -@@ -3200,6 +3200,27 @@ void *vzalloc_node(unsigned long size, int node) +@@ -3201,6 +3201,27 @@ void *vzalloc_node(unsigned long size, int node) } EXPORT_SYMBOL(vzalloc_node); @@ -77523,3 +81207,6 @@ index e8a807c78110..5fb4311fdc07 100644 #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA) +-- 
+2.38.1.385.g3b08839926 + diff --git a/linux-tkg-patches/6.0/0008-6.0-bcachefs.patch b/linux-tkg-patches/6.0/0008-6.0-bcachefs.patch index e3d463e..1a153c2 100644 --- a/linux-tkg-patches/6.0/0008-6.0-bcachefs.patch +++ b/linux-tkg-patches/6.0/0008-6.0-bcachefs.patch @@ -1,12 +1,13 @@ -From 3affa82eb5db6a292711c1d9febffa8ff7332f55 Mon Sep 17 00:00:00 2001 +From 3522899e342d546a6da80a5a6a621e9bd69e8536 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Fri, 7 Oct 2022 19:45:17 +0200 +Date: Sun, 6 Nov 2022 10:46:22 +0100 Subject: [PATCH] bcachefs Signed-off-by: Peter Jung --- .github/ISSUE_TEMPLATE/bug_report.md | 61 + Documentation/core-api/printk-formats.rst | 22 + + MAINTAINERS | 9 + arch/powerpc/kernel/process.c | 16 +- arch/powerpc/kernel/security.c | 75 +- arch/powerpc/platforms/pseries/papr_scm.c | 34 +- @@ -26,48 +27,50 @@ Signed-off-by: Peter Jung drivers/pci/p2pdma.c | 21 +- fs/Kconfig | 1 + fs/Makefile | 1 + - fs/bcachefs/Kconfig | 59 + + fs/bcachefs/Kconfig | 60 + fs/bcachefs/Makefile | 70 + fs/bcachefs/acl.c | 406 ++ fs/bcachefs/acl.h | 58 + - fs/bcachefs/alloc_background.c | 1551 ++++++++ + fs/bcachefs/alloc_background.c | 1585 ++++++++ fs/bcachefs/alloc_background.h | 183 + - fs/bcachefs/alloc_foreground.c | 1383 +++++++ - fs/bcachefs/alloc_foreground.h | 181 + - fs/bcachefs/alloc_types.h | 87 + - fs/bcachefs/backpointers.c | 898 +++++ - fs/bcachefs/backpointers.h | 38 + - fs/bcachefs/bcachefs.h | 1001 +++++ - fs/bcachefs/bcachefs_format.h | 2122 ++++++++++ + fs/bcachefs/alloc_foreground.c | 1398 +++++++ + fs/bcachefs/alloc_foreground.h | 174 + + fs/bcachefs/alloc_types.h | 92 + + fs/bcachefs/backpointers.c | 1103 +++++ + fs/bcachefs/backpointers.h | 58 + + fs/bcachefs/bbpos.h | 48 + + fs/bcachefs/bcachefs.h | 1019 +++++ + fs/bcachefs/bcachefs_format.h | 2172 ++++++++++ fs/bcachefs/bcachefs_ioctl.h | 368 ++ - fs/bcachefs/bkey.c | 1203 ++++++ - fs/bcachefs/bkey.h | 571 +++ - fs/bcachefs/bkey_buf.h | 60 + - fs/bcachefs/bkey_methods.c | 503 +++ + 
fs/bcachefs/bkey.c | 1098 +++++ + fs/bcachefs/bkey.h | 666 +++ + fs/bcachefs/bkey_buf.h | 61 + + fs/bcachefs/bkey_cmp.h | 129 + + fs/bcachefs/bkey_methods.c | 505 +++ fs/bcachefs/bkey_methods.h | 175 + - fs/bcachefs/bkey_sort.c | 198 + + fs/bcachefs/bkey_sort.c | 199 + fs/bcachefs/bkey_sort.h | 44 + - fs/bcachefs/bset.c | 1598 ++++++++ - fs/bcachefs/bset.h | 615 +++ - fs/bcachefs/btree_cache.c | 1149 ++++++ - fs/bcachefs/btree_cache.h | 105 + + fs/bcachefs/bset.c | 1601 ++++++++ + fs/bcachefs/bset.h | 521 +++ + fs/bcachefs/btree_cache.c | 1204 ++++++ + fs/bcachefs/btree_cache.h | 106 + fs/bcachefs/btree_gc.c | 2106 ++++++++++ fs/bcachefs/btree_gc.h | 112 + - fs/bcachefs/btree_io.c | 2154 +++++++++++ - fs/bcachefs/btree_io.h | 222 ++ - fs/bcachefs/btree_iter.c | 3043 +++++++++++++++ - fs/bcachefs/btree_iter.h | 564 +++ - fs/bcachefs/btree_key_cache.c | 983 +++++ - fs/bcachefs/btree_key_cache.h | 47 + - fs/bcachefs/btree_locking.c | 676 ++++ - fs/bcachefs/btree_locking.h | 418 ++ - fs/bcachefs/btree_types.h | 696 ++++ + fs/bcachefs/btree_io.c | 2203 ++++++++++ + fs/bcachefs/btree_io.h | 228 ++ + fs/bcachefs/btree_iter.c | 3121 ++++++++++++++ + fs/bcachefs/btree_iter.h | 599 +++ + fs/bcachefs/btree_key_cache.c | 1034 +++++ + fs/bcachefs/btree_key_cache.h | 48 + + fs/bcachefs/btree_locking.c | 679 ++++ + fs/bcachefs/btree_locking.h | 419 ++ + fs/bcachefs/btree_types.h | 708 ++++ fs/bcachefs/btree_update.h | 158 + - fs/bcachefs/btree_update_interior.c | 2352 ++++++++++++ - fs/bcachefs/btree_update_interior.h | 322 ++ - fs/bcachefs/btree_update_leaf.c | 1745 +++++++++ - fs/bcachefs/buckets.c | 2113 ++++++++++ - fs/bcachefs/buckets.h | 300 ++ + fs/bcachefs/btree_update_interior.c | 2437 +++++++++++ + fs/bcachefs/btree_update_interior.h | 324 ++ + fs/bcachefs/btree_update_leaf.c | 1760 ++++++++ + fs/bcachefs/buckets.c | 2117 ++++++++++ + fs/bcachefs/buckets.h | 326 ++ fs/bcachefs/buckets_types.h | 103 + fs/bcachefs/buckets_waiting_for_journal.c | 167 + 
fs/bcachefs/buckets_waiting_for_journal.h | 15 + @@ -75,7 +78,7 @@ Signed-off-by: Peter Jung fs/bcachefs/chardev.c | 760 ++++ fs/bcachefs/chardev.h | 31 + fs/bcachefs/checksum.c | 712 ++++ - fs/bcachefs/checksum.h | 204 + + fs/bcachefs/checksum.h | 212 + fs/bcachefs/clock.c | 191 + fs/bcachefs/clock.h | 38 + fs/bcachefs/clock_types.h | 37 + @@ -84,103 +87,103 @@ Signed-off-by: Peter Jung fs/bcachefs/counters.c | 107 + fs/bcachefs/counters.h | 17 + fs/bcachefs/darray.h | 77 + - fs/bcachefs/data_update.c | 373 ++ - fs/bcachefs/data_update.h | 38 + - fs/bcachefs/debug.c | 831 ++++ + fs/bcachefs/data_update.c | 387 ++ + fs/bcachefs/data_update.h | 41 + + fs/bcachefs/debug.c | 811 ++++ fs/bcachefs/debug.h | 30 + fs/bcachefs/dirent.c | 565 +++ fs/bcachefs/dirent.h | 67 + fs/bcachefs/disk_groups.c | 505 +++ fs/bcachefs/disk_groups.h | 91 + - fs/bcachefs/ec.c | 1673 ++++++++ - fs/bcachefs/ec.h | 230 ++ + fs/bcachefs/ec.c | 1680 ++++++++ + fs/bcachefs/ec.h | 224 ++ fs/bcachefs/ec_types.h | 46 + - fs/bcachefs/errcode.c | 62 + - fs/bcachefs/errcode.h | 96 + - fs/bcachefs/error.c | 218 ++ - fs/bcachefs/error.h | 222 ++ + fs/bcachefs/errcode.c | 63 + + fs/bcachefs/errcode.h | 97 + + fs/bcachefs/error.c | 221 + + fs/bcachefs/error.h | 222 + fs/bcachefs/extent_update.c | 178 + fs/bcachefs/extent_update.h | 12 + - fs/bcachefs/extents.c | 1324 +++++++ - fs/bcachefs/extents.h | 685 ++++ + fs/bcachefs/extents.c | 1324 ++++++ + fs/bcachefs/extents.h | 689 ++++ fs/bcachefs/extents_types.h | 40 + fs/bcachefs/eytzinger.h | 281 ++ fs/bcachefs/fifo.h | 127 + - fs/bcachefs/fs-common.c | 496 +++ + fs/bcachefs/fs-common.c | 501 +++ fs/bcachefs/fs-common.h | 43 + - fs/bcachefs/fs-io.c | 3421 +++++++++++++++++ + fs/bcachefs/fs-io.c | 3577 +++++++++++++++++ fs/bcachefs/fs-io.h | 54 + - fs/bcachefs/fs-ioctl.c | 539 +++ + fs/bcachefs/fs-ioctl.c | 555 +++ fs/bcachefs/fs-ioctl.h | 81 + - fs/bcachefs/fs.c | 1942 ++++++++++ + fs/bcachefs/fs.c | 1941 +++++++++ fs/bcachefs/fs.h | 208 + - 
fs/bcachefs/fsck.c | 2395 ++++++++++++ + fs/bcachefs/fsck.c | 2395 +++++++++++ fs/bcachefs/fsck.h | 8 + - fs/bcachefs/inode.c | 771 ++++ - fs/bcachefs/inode.h | 189 + - fs/bcachefs/io.c | 2436 ++++++++++++ - fs/bcachefs/io.h | 190 + - fs/bcachefs/io_types.h | 161 + + fs/bcachefs/inode.c | 892 ++++ + fs/bcachefs/inode.h | 202 + + fs/bcachefs/io.c | 2469 ++++++++++++ + fs/bcachefs/io.h | 183 + + fs/bcachefs/io_types.h | 156 + fs/bcachefs/journal.c | 1436 +++++++ - fs/bcachefs/journal.h | 521 +++ - fs/bcachefs/journal_io.c | 1759 +++++++++ + fs/bcachefs/journal.h | 540 +++ + fs/bcachefs/journal_io.c | 1807 +++++++++ fs/bcachefs/journal_io.h | 59 + fs/bcachefs/journal_reclaim.c | 853 ++++ fs/bcachefs/journal_reclaim.h | 86 + - fs/bcachefs/journal_sb.c | 220 ++ + fs/bcachefs/journal_sb.c | 220 + fs/bcachefs/journal_sb.h | 24 + fs/bcachefs/journal_seq_blacklist.c | 322 ++ fs/bcachefs/journal_seq_blacklist.h | 22 + fs/bcachefs/journal_types.h | 340 ++ - fs/bcachefs/keylist.c | 67 + - fs/bcachefs/keylist.h | 76 + + fs/bcachefs/keylist.c | 68 + + fs/bcachefs/keylist.h | 75 + fs/bcachefs/keylist_types.h | 16 + fs/bcachefs/lru.c | 206 + fs/bcachefs/lru.h | 19 + fs/bcachefs/migrate.c | 186 + fs/bcachefs/migrate.h | 7 + - fs/bcachefs/move.c | 954 +++++ + fs/bcachefs/move.c | 1011 +++++ fs/bcachefs/move.h | 67 + fs/bcachefs/move_types.h | 19 + fs/bcachefs/movinggc.c | 285 ++ fs/bcachefs/movinggc.h | 10 + fs/bcachefs/opts.c | 578 +++ fs/bcachefs/opts.h | 509 +++ - fs/bcachefs/quota.c | 823 ++++ + fs/bcachefs/quota.c | 978 +++++ fs/bcachefs/quota.h | 71 + fs/bcachefs/quota_types.h | 43 + fs/bcachefs/rebalance.c | 362 ++ fs/bcachefs/rebalance.h | 28 + fs/bcachefs/rebalance_types.h | 26 + - fs/bcachefs/recovery.c | 1587 ++++++++ + fs/bcachefs/recovery.c | 1606 ++++++++ fs/bcachefs/recovery.h | 58 + fs/bcachefs/reflink.c | 422 ++ fs/bcachefs/reflink.h | 76 + - fs/bcachefs/replicas.c | 1071 ++++++ - fs/bcachefs/replicas.h | 106 + - fs/bcachefs/replicas_types.h | 10 + + 
fs/bcachefs/replicas.c | 1071 +++++ + fs/bcachefs/replicas.h | 107 + + fs/bcachefs/replicas_types.h | 11 + fs/bcachefs/siphash.c | 173 + fs/bcachefs/siphash.h | 87 + fs/bcachefs/str_hash.h | 370 ++ - fs/bcachefs/subvolume.c | 1110 ++++++ + fs/bcachefs/subvolume.c | 1111 +++++ fs/bcachefs/subvolume.h | 137 + fs/bcachefs/subvolume_types.h | 9 + - fs/bcachefs/super-io.c | 1603 ++++++++ + fs/bcachefs/super-io.c | 1601 ++++++++ fs/bcachefs/super-io.h | 126 + - fs/bcachefs/super.c | 1964 ++++++++++ + fs/bcachefs/super.c | 1961 +++++++++ fs/bcachefs/super.h | 264 ++ fs/bcachefs/super_types.h | 51 + - fs/bcachefs/sysfs.c | 954 +++++ + fs/bcachefs/sysfs.c | 963 +++++ fs/bcachefs/sysfs.h | 48 + - fs/bcachefs/tests.c | 976 +++++ + fs/bcachefs/tests.c | 973 +++++ fs/bcachefs/tests.h | 15 + fs/bcachefs/trace.c | 14 + - fs/bcachefs/util.c | 993 +++++ - fs/bcachefs/util.h | 787 ++++ + fs/bcachefs/util.c | 1104 +++++ + fs/bcachefs/util.h | 793 ++++ fs/bcachefs/varint.c | 121 + fs/bcachefs/varint.h | 11 + fs/bcachefs/vstructs.h | 63 + - fs/bcachefs/xattr.c | 650 ++++ + fs/bcachefs/xattr.c | 654 +++ fs/bcachefs/xattr.h | 50 + fs/d_path.c | 35 + fs/dcache.c | 10 +- @@ -197,23 +200,24 @@ Signed-off-by: Peter Jung include/linux/list_bl.h | 22 + include/linux/lockdep.h | 10 + include/linux/lockdep_types.h | 2 +- + include/linux/mean_and_variance.h | 170 + include/linux/pretty-printers.h | 10 + include/linux/printbuf.h | 306 ++ include/linux/sched.h | 1 + include/linux/seq_buf.h | 162 - include/linux/shrinker.h | 9 +- - include/linux/six.h | 222 ++ + include/linux/six.h | 222 + include/linux/string.h | 5 + include/linux/string_helpers.h | 8 +- include/linux/trace_events.h | 2 +- include/linux/trace_seq.h | 17 +- include/linux/vmalloc.h | 1 + - include/trace/events/bcachefs.h | 1101 ++++++ + include/trace/events/bcachefs.h | 1105 +++++ init/init_task.c | 1 + kernel/Kconfig.locks | 3 + kernel/locking/Makefile | 1 + kernel/locking/lockdep.c | 45 + - kernel/locking/six.c | 748 ++++ + 
kernel/locking/six.c | 757 ++++ kernel/module/main.c | 4 +- kernel/stacktrace.c | 2 + kernel/trace/trace.c | 45 +- @@ -224,20 +228,25 @@ Signed-off-by: Peter Jung kernel/trace/trace_kprobe.c | 2 +- kernel/trace/trace_seq.c | 111 +- lib/Kconfig | 3 + - lib/Kconfig.debug | 9 + + lib/Kconfig.debug | 18 + lib/Makefile | 8 +- {drivers/md/bcache => lib}/closure.c | 35 +- lib/errname.c | 1 + lib/generic-radix-tree.c | 76 +- lib/hexdump.c | 246 +- + lib/math/Kconfig | 3 + + lib/math/Makefile | 2 + + lib/math/mean_and_variance.c | 178 + + lib/math/mean_and_variance_test.c | 152 + lib/pretty-printers.c | 60 + lib/printbuf.c | 368 ++ lib/seq_buf.c | 397 -- lib/string_helpers.c | 224 +- lib/test_hexdump.c | 30 +- lib/test_printf.c | 33 +- - lib/vsprintf.c | 1740 ++++----- + lib/vsprintf.c | 1741 ++++---- mm/Makefile | 2 +- + mm/filemap.c | 1 + mm/memcontrol.c | 54 +- mm/nommu.c | 18 + mm/oom_kill.c | 23 - @@ -247,7 +256,7 @@ Signed-off-by: Peter Jung mm/vmalloc.c | 21 + mm/vmscan.c | 105 +- tools/testing/nvdimm/test/ndtest.c | 22 +- - 242 files changed, 85340 insertions(+), 2187 deletions(-) + 251 files changed, 87492 insertions(+), 2187 deletions(-) create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md create mode 100644 fs/bcachefs/Kconfig create mode 100644 fs/bcachefs/Makefile @@ -260,12 +269,14 @@ Signed-off-by: Peter Jung create mode 100644 fs/bcachefs/alloc_types.h create mode 100644 fs/bcachefs/backpointers.c create mode 100644 fs/bcachefs/backpointers.h + create mode 100644 fs/bcachefs/bbpos.h create mode 100644 fs/bcachefs/bcachefs.h create mode 100644 fs/bcachefs/bcachefs_format.h create mode 100644 fs/bcachefs/bcachefs_ioctl.h create mode 100644 fs/bcachefs/bkey.c create mode 100644 fs/bcachefs/bkey.h create mode 100644 fs/bcachefs/bkey_buf.h + create mode 100644 fs/bcachefs/bkey_cmp.h create mode 100644 fs/bcachefs/bkey_methods.c create mode 100644 fs/bcachefs/bkey_methods.h create mode 100644 fs/bcachefs/bkey_sort.c @@ -406,6 +417,7 @@ Signed-off-by: Peter Jung 
create mode 100644 fs/bcachefs/xattr.c create mode 100644 fs/bcachefs/xattr.h rename {drivers/md/bcache => include/linux}/closure.h (94%) + create mode 100644 include/linux/mean_and_variance.h create mode 100644 include/linux/pretty-printers.h create mode 100644 include/linux/printbuf.h delete mode 100644 include/linux/seq_buf.h @@ -413,6 +425,8 @@ Signed-off-by: Peter Jung create mode 100644 include/trace/events/bcachefs.h create mode 100644 kernel/locking/six.c rename {drivers/md/bcache => lib}/closure.c (88%) + create mode 100644 lib/math/mean_and_variance.c + create mode 100644 lib/math/mean_and_variance_test.c create mode 100644 lib/pretty-printers.c create mode 100644 lib/printbuf.c delete mode 100644 lib/seq_buf.c @@ -486,7 +500,7 @@ index 000000000000..8af34357dd98 +* provide the output of `bcachefs list_journal -a | zstd -f -T0 -o ../journal.log.zst` +*compress & upload all the `metdata.dump.*` files from: bcachefs dump -o metadata.dump diff --git a/Documentation/core-api/printk-formats.rst b/Documentation/core-api/printk-formats.rst -index 5e89497ba314..4f4a35b3aadc 100644 +index 5e89497ba314..608eb514f171 100644 --- a/Documentation/core-api/printk-formats.rst +++ b/Documentation/core-api/printk-formats.rst @@ -625,6 +625,28 @@ Examples:: @@ -503,7 +517,7 @@ index 5e89497ba314..4f4a35b3aadc 100644 + +For calling generic pretty printers. A pretty printer is a function that takes +as its first argument a pointer to a printbuf, and then zero or more additional -+pointer arguments. For example: ++pointer arguments. 
For example:: + + void foo_to_text(struct printbuf *out, struct foo *foo) + { @@ -518,6 +532,26 @@ index 5e89497ba314..4f4a35b3aadc 100644 Thanks ====== +diff --git a/MAINTAINERS b/MAINTAINERS +index 72b9654f764c..06bb50e760df 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -12505,6 +12505,15 @@ F: Documentation/devicetree/bindings/net/ieee802154/mcr20a.txt + F: drivers/net/ieee802154/mcr20a.c + F: drivers/net/ieee802154/mcr20a.h + ++MEAN AND VARIANCE LIBRARY ++M: Daniel B. Hill ++M: Kent Overstreet ++S: Maintained ++T: git https://github.com/YellowOnion/linux/ ++F: include/linux/mean_and_variance.h ++F: lib/math/mean_and_variance.c ++F: lib/math/mean_and_variance_test.c ++ + MEASUREMENT COMPUTING CIO-DAC IIO DRIVER + M: William Breathitt Gray + L: linux-iio@vger.kernel.org diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 0fbda89cd1bb..05654dbeb2c4 100644 --- a/arch/powerpc/kernel/process.c @@ -880,7 +914,7 @@ index f276aff521e8..50c12711a249 100644 ret = rdtgroup_setup_root(); if (ret) diff --git a/block/bio.c b/block/bio.c -index 3d3a2678fea2..ed9a4df9ea36 100644 +index 77e3b764a078..cdb26dc0d638 100644 --- a/block/bio.c +++ b/block/bio.c @@ -582,15 +582,15 @@ struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask) @@ -902,7 +936,7 @@ index 3d3a2678fea2..ed9a4df9ea36 100644 /** * bio_truncate - truncate the bio to small size of @new_size -@@ -1200,7 +1200,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) +@@ -1198,7 +1198,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) struct page **pages = (struct page **)bv; ssize_t size, left; unsigned len, i = 0; @@ -911,7 +945,7 @@ index 3d3a2678fea2..ed9a4df9ea36 100644 int ret = 0; /* -@@ -1225,10 +1225,12 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) +@@ -1223,10 +1223,12 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) nr_pages = DIV_ROUND_UP(offset + 
size, PAGE_SIZE); @@ -927,7 +961,7 @@ index 3d3a2678fea2..ed9a4df9ea36 100644 if (unlikely(!size)) { ret = -EFAULT; goto out; -@@ -1437,6 +1439,7 @@ void bio_set_pages_dirty(struct bio *bio) +@@ -1435,6 +1437,7 @@ void bio_set_pages_dirty(struct bio *bio) set_page_dirty_lock(bvec->bv_page); } } @@ -935,7 +969,7 @@ index 3d3a2678fea2..ed9a4df9ea36 100644 /* * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. -@@ -1496,6 +1499,7 @@ void bio_check_pages_dirty(struct bio *bio) +@@ -1494,6 +1497,7 @@ void bio_check_pages_dirty(struct bio *bio) spin_unlock_irqrestore(&bio_dirty_lock, flags); schedule_work(&bio_dirty_work); } @@ -956,7 +990,7 @@ index 651057c4146b..10bf4ac26bed 100644 /** * blk_sync_queue - cancel any pending callbacks on a queue diff --git a/block/blk.h b/block/blk.h -index d7142c4d2fef..1b22813ee530 100644 +index 52432eab621e..2af5287b97de 100644 --- a/block/blk.h +++ b/block/blk.h @@ -250,7 +250,6 @@ static inline void blk_integrity_del(struct gendisk *disk) @@ -1296,10 +1330,10 @@ index 93b80529f8e8..2b8d04016a20 100644 obj-$(CONFIG_EFIVAR_FS) += efivarfs/ diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig new file mode 100644 -index 000000000000..008886967841 +index 000000000000..2b9387ac1bca --- /dev/null +++ b/fs/bcachefs/Kconfig -@@ -0,0 +1,59 @@ +@@ -0,0 +1,60 @@ + +config BCACHEFS_FS + tristate "bcachefs filesystem support" @@ -1325,6 +1359,7 @@ index 000000000000..008886967841 + select XXHASH + select SRCU + select SYMBOLIC_ERRNAME ++ select MEAN_AND_VARIANCE + help + The bcachefs filesystem - a modern, copy on write filesystem, with + support for multiple devices, compression, checksumming, etc. 
@@ -1437,7 +1472,7 @@ index 000000000000..8124d356baa1 +bcachefs-$(CONFIG_BCACHEFS_POSIX_ACL) += acl.o diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c new file mode 100644 -index 000000000000..5c6ccf685094 +index 000000000000..9592541f7b5c --- /dev/null +++ b/fs/bcachefs/acl.c @@ -0,0 +1,406 @@ @@ -1616,7 +1651,7 @@ index 000000000000..5c6ccf685094 + bkey_xattr_init(&xattr->k_i); + xattr->k.u64s = u64s; + xattr->v.x_type = acl_to_xattr_type(type); -+ xattr->v.x_name_len = 0, ++ xattr->v.x_name_len = 0; + xattr->v.x_val_len = cpu_to_le16(acl_len); + + acl_header = xattr_val(&xattr->v); @@ -1913,10 +1948,10 @@ index 000000000000..2d76a4897ba8 +#endif /* _BCACHEFS_ACL_H */ diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c new file mode 100644 -index 000000000000..d0d7690a4940 +index 000000000000..ccd3f72ae19f --- /dev/null +++ b/fs/bcachefs/alloc_background.c -@@ -0,0 +1,1551 @@ +@@ -0,0 +1,1585 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "alloc_background.h" @@ -2129,31 +2164,6 @@ index 000000000000..d0d7690a4940 + return ret; +} + -+struct bkey_i_alloc_v4 * -+bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter, -+ struct bpos pos) -+{ -+ struct bkey_s_c k; -+ struct bkey_i_alloc_v4 *a; -+ int ret; -+ -+ bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos, -+ BTREE_ITER_WITH_UPDATES| -+ BTREE_ITER_CACHED| -+ BTREE_ITER_INTENT); -+ k = bch2_btree_iter_peek_slot(iter); -+ ret = bkey_err(k); -+ if (ret) { -+ bch2_trans_iter_exit(trans, iter); -+ return ERR_PTR(ret); -+ } -+ -+ a = bch2_alloc_to_v4_mut(trans, k); -+ if (IS_ERR(a)) -+ bch2_trans_iter_exit(trans, iter); -+ return a; -+} -+ +static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) +{ + unsigned i, bytes = offsetof(struct bch_alloc, data); @@ -2223,6 +2233,18 @@ index 000000000000..d0d7690a4940 + return -EINVAL; + } + ++ if (rw == WRITE && test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) { ++ unsigned i, 
bp_len = 0; ++ ++ for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); i++) ++ bp_len += alloc_v4_backpointers_c(a.v)[i].bucket_len; ++ ++ if (bp_len > a.v->dirty_sectors) { ++ prt_printf(err, "too many backpointers"); ++ return -EINVAL; ++ } ++ } ++ + if (rw == WRITE) { + if (alloc_data_type(*a.v, a.v->data_type) != a.v->data_type) { + prt_printf(err, "invalid data type (got %u should be %u)", @@ -2394,12 +2416,13 @@ index 000000000000..d0d7690a4940 + } +} + -+struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) ++static noinline struct bkey_i_alloc_v4 * ++__bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) +{ ++ struct bkey_i_alloc_v4 *ret; + unsigned bytes = k.k->type == KEY_TYPE_alloc_v4 + ? bkey_bytes(k.k) + : sizeof(struct bkey_i_alloc_v4); -+ struct bkey_i_alloc_v4 *ret; + + /* + * Reserve space for one more backpointer here: @@ -2410,20 +2433,18 @@ index 000000000000..d0d7690a4940 + return ret; + + if (k.k->type == KEY_TYPE_alloc_v4) { ++ struct bch_backpointer *src, *dst; ++ + bkey_reassemble(&ret->k_i, k); + -+ if (BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v) < BCH_ALLOC_V4_U64s) { -+ struct bch_backpointer *src, *dst; ++ src = alloc_v4_backpointers(&ret->v); ++ SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s); ++ dst = alloc_v4_backpointers(&ret->v); + -+ src = alloc_v4_backpointers(&ret->v); -+ SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s); -+ dst = alloc_v4_backpointers(&ret->v); -+ -+ memmove(dst, src, BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v) * -+ sizeof(struct bch_backpointer)); -+ memset(src, 0, dst - src); -+ set_alloc_v4_u64s(ret); -+ } ++ memmove(dst, src, BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v) * ++ sizeof(struct bch_backpointer)); ++ memset(src, 0, dst - src); ++ set_alloc_v4_u64s(ret); + } else { + bkey_alloc_v4_init(&ret->k_i); + ret->k.p = k.k->p; @@ -2432,6 +2453,54 @@ index 000000000000..d0d7690a4940 + return ret; +} + ++static inline struct bkey_i_alloc_v4 
*bch2_alloc_to_v4_mut_inlined(struct btree_trans *trans, struct bkey_s_c k) ++{ ++ if (likely(k.k->type == KEY_TYPE_alloc_v4) && ++ BCH_ALLOC_V4_BACKPOINTERS_START(bkey_s_c_to_alloc_v4(k).v) == BCH_ALLOC_V4_U64s) { ++ /* ++ * Reserve space for one more backpointer here: ++ * Not sketchy at doing it this way, nope... ++ */ ++ struct bkey_i_alloc_v4 *ret = ++ bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(struct bch_backpointer)); ++ if (!IS_ERR(ret)) ++ bkey_reassemble(&ret->k_i, k); ++ return ret; ++ } ++ ++ return __bch2_alloc_to_v4_mut(trans, k); ++} ++ ++struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) ++{ ++ return bch2_alloc_to_v4_mut_inlined(trans, k); ++} ++ ++struct bkey_i_alloc_v4 * ++bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter, ++ struct bpos pos) ++{ ++ struct bkey_s_c k; ++ struct bkey_i_alloc_v4 *a; ++ int ret; ++ ++ bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos, ++ BTREE_ITER_WITH_UPDATES| ++ BTREE_ITER_CACHED| ++ BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) { ++ bch2_trans_iter_exit(trans, iter); ++ return ERR_PTR(ret); ++ } ++ ++ a = bch2_alloc_to_v4_mut_inlined(trans, k); ++ if (IS_ERR(a)) ++ bch2_trans_iter_exit(trans, iter); ++ return a; ++} ++ +int bch2_alloc_read(struct bch_fs *c) +{ + struct btree_trans trans; @@ -3470,7 +3539,7 @@ index 000000000000..d0d7690a4940 +} diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h new file mode 100644 -index 000000000000..044bc72992d4 +index 000000000000..ee683bdde956 --- /dev/null +++ b/fs/bcachefs/alloc_background.h @@ -0,0 +1,183 @@ @@ -3579,34 +3648,34 @@ index 000000000000..044bc72992d4 +void bch2_alloc_v4_swab(struct bkey_s); +void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + -+#define bch2_bkey_ops_alloc (struct bkey_ops) { \ ++#define bch2_bkey_ops_alloc ((struct bkey_ops) { \ + .key_invalid = 
bch2_alloc_v1_invalid, \ + .val_to_text = bch2_alloc_to_text, \ + .trans_trigger = bch2_trans_mark_alloc, \ + .atomic_trigger = bch2_mark_alloc, \ -+} ++}) + -+#define bch2_bkey_ops_alloc_v2 (struct bkey_ops) { \ ++#define bch2_bkey_ops_alloc_v2 ((struct bkey_ops) { \ + .key_invalid = bch2_alloc_v2_invalid, \ + .val_to_text = bch2_alloc_to_text, \ + .trans_trigger = bch2_trans_mark_alloc, \ + .atomic_trigger = bch2_mark_alloc, \ -+} ++}) + -+#define bch2_bkey_ops_alloc_v3 (struct bkey_ops) { \ ++#define bch2_bkey_ops_alloc_v3 ((struct bkey_ops) { \ + .key_invalid = bch2_alloc_v3_invalid, \ + .val_to_text = bch2_alloc_to_text, \ + .trans_trigger = bch2_trans_mark_alloc, \ + .atomic_trigger = bch2_mark_alloc, \ -+} ++}) + -+#define bch2_bkey_ops_alloc_v4 (struct bkey_ops) { \ ++#define bch2_bkey_ops_alloc_v4 ((struct bkey_ops) { \ + .key_invalid = bch2_alloc_v4_invalid, \ + .val_to_text = bch2_alloc_to_text, \ + .swab = bch2_alloc_v4_swab, \ + .trans_trigger = bch2_trans_mark_alloc, \ + .atomic_trigger = bch2_mark_alloc, \ -+} ++}) + +static inline bool bkey_is_alloc(const struct bkey *k) +{ @@ -3659,10 +3728,10 @@ index 000000000000..044bc72992d4 +#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c new file mode 100644 -index 000000000000..e89999cf9238 +index 000000000000..55708d2da960 --- /dev/null +++ b/fs/bcachefs/alloc_foreground.c -@@ -0,0 +1,1383 @@ +@@ -0,0 +1,1398 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2012 Google, Inc. 
@@ -3679,6 +3748,7 @@ index 000000000000..e89999cf9238 +#include "bcachefs.h" +#include "alloc_background.h" +#include "alloc_foreground.h" ++#include "backpointers.h" +#include "btree_iter.h" +#include "btree_update.h" +#include "btree_gc.h" @@ -3998,6 +4068,29 @@ index 000000000000..e89999cf9238 + goto err; + } + ++ if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) { ++ struct bch_backpointer bp; ++ u64 bp_offset = 0; ++ ++ ret = bch2_get_next_backpointer(trans, POS(ca->dev_idx, b), -1, ++ &bp_offset, &bp, ++ BTREE_ITER_NOPRESERVE); ++ if (ret) { ++ ob = ERR_PTR(ret); ++ goto err; ++ } ++ ++ if (bp_offset != U64_MAX) { ++ /* ++ * Bucket may have data in it - we don't call ++ * bc2h_trans_inconnsistent() because fsck hasn't ++ * finished yet ++ */ ++ ob = NULL; ++ goto err; ++ } ++ } ++ + ob = __try_alloc_bucket(c, ca, b, reserve, &a, + skipped_open, + skipped_need_journal_commit, @@ -4154,16 +4247,16 @@ index 000000000000..e89999cf9238 + * bch_bucket_alloc - allocate a single bucket from a specific device + * + * Returns index of bucket on success, 0 on failure -+ * */ ++ */ +static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, + struct bch_dev *ca, + enum alloc_reserve reserve, + bool may_alloc_partial, -+ struct closure *cl) ++ struct closure *cl, ++ struct bch_dev_usage *usage) +{ + struct bch_fs *c = trans->c; + struct open_bucket *ob = NULL; -+ struct bch_dev_usage usage; + bool freespace_initialized = READ_ONCE(ca->mi.freespace_initialized); + u64 start = freespace_initialized ? 
0 : ca->bucket_alloc_trans_early_cursor; + u64 avail; @@ -4174,16 +4267,16 @@ index 000000000000..e89999cf9238 + u64 skipped_nouse = 0; + bool waiting = false; +again: -+ usage = bch2_dev_usage_read(ca); -+ avail = dev_buckets_free(ca, usage, reserve); ++ bch2_dev_usage_read_fast(ca, usage); ++ avail = dev_buckets_free(ca, *usage, reserve); + -+ if (usage.d[BCH_DATA_need_discard].buckets > avail) ++ if (usage->d[BCH_DATA_need_discard].buckets > avail) + bch2_do_discards(c); + -+ if (usage.d[BCH_DATA_need_gc_gens].buckets > avail) ++ if (usage->d[BCH_DATA_need_gc_gens].buckets > avail) + bch2_do_gc_gens(c); + -+ if (should_invalidate_buckets(ca, usage)) ++ if (should_invalidate_buckets(ca, *usage)) + bch2_do_invalidates(c); + + if (!avail) { @@ -4242,10 +4335,10 @@ index 000000000000..e89999cf9238 + if (!IS_ERR(ob)) + trace_and_count(c, bucket_alloc, ca, bch2_alloc_reserves[reserve], + may_alloc_partial, ob->bucket); -+ else ++ else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart)) + trace_and_count(c, bucket_alloc_fail, + ca, bch2_alloc_reserves[reserve], -+ usage.d[BCH_DATA_free].buckets, ++ usage->d[BCH_DATA_free].buckets, + avail, + bch2_copygc_wait_amount(c), + c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now), @@ -4264,11 +4357,12 @@ index 000000000000..e89999cf9238 + bool may_alloc_partial, + struct closure *cl) +{ ++ struct bch_dev_usage usage; + struct open_bucket *ob; + + bch2_trans_do(c, NULL, NULL, 0, + PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, reserve, -+ may_alloc_partial, cl))); ++ may_alloc_partial, cl, &usage))); + return ob; +} + @@ -4295,8 +4389,9 @@ index 000000000000..e89999cf9238 + return ret; +} + -+void bch2_dev_stripe_increment(struct bch_dev *ca, -+ struct dev_stripe_state *stripe) ++static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca, ++ struct dev_stripe_state *stripe, ++ struct bch_dev_usage *usage) +{ + u64 *v = stripe->next_alloc + ca->dev_idx; + u64 free_space = 
dev_buckets_available(ca, RESERVE_none); @@ -4315,6 +4410,15 @@ index 000000000000..e89999cf9238 + *v = *v < scale ? 0 : *v - scale; +} + ++void bch2_dev_stripe_increment(struct bch_dev *ca, ++ struct dev_stripe_state *stripe) ++{ ++ struct bch_dev_usage usage; ++ ++ bch2_dev_usage_read_fast(ca, &usage); ++ bch2_dev_stripe_increment_inlined(ca, stripe, &usage); ++} ++ +#define BUCKET_MAY_ALLOC_PARTIAL (1 << 0) +#define BUCKET_ALLOC_USE_DURABILITY (1 << 1) + @@ -4359,6 +4463,7 @@ index 000000000000..e89999cf9238 + BUG_ON(*nr_effective >= nr_replicas); + + for (i = 0; i < devs_sorted.nr; i++) { ++ struct bch_dev_usage usage; + struct open_bucket *ob; + + dev = devs_sorted.devs[i]; @@ -4378,9 +4483,9 @@ index 000000000000..e89999cf9238 + } + + ob = bch2_bucket_alloc_trans(trans, ca, reserve, -+ flags & BUCKET_MAY_ALLOC_PARTIAL, cl); ++ flags & BUCKET_MAY_ALLOC_PARTIAL, cl, &usage); + if (!IS_ERR(ob)) -+ bch2_dev_stripe_increment(ca, stripe); ++ bch2_dev_stripe_increment_inlined(ca, stripe, &usage); + percpu_ref_put(&ca->ref); + + if (IS_ERR(ob)) { @@ -4775,23 +4880,24 @@ index 000000000000..e89999cf9238 + hlist_add_head_rcu(&wp->node, head); + mutex_unlock(&c->write_points_hash_lock); +out: -+ wp->last_used = sched_clock(); ++ wp->last_used = local_clock(); + return wp; +} + +/* + * Get us an open_bucket we can allocate from, return with it locked: + */ -+struct write_point *bch2_alloc_sectors_start_trans(struct btree_trans *trans, -+ unsigned target, -+ unsigned erasure_code, -+ struct write_point_specifier write_point, -+ struct bch_devs_list *devs_have, -+ unsigned nr_replicas, -+ unsigned nr_replicas_required, -+ enum alloc_reserve reserve, -+ unsigned flags, -+ struct closure *cl) ++int bch2_alloc_sectors_start_trans(struct btree_trans *trans, ++ unsigned target, ++ unsigned erasure_code, ++ struct write_point_specifier write_point, ++ struct bch_devs_list *devs_have, ++ unsigned nr_replicas, ++ unsigned nr_replicas_required, ++ enum alloc_reserve reserve, ++ 
unsigned flags, ++ struct closure *cl, ++ struct write_point **wp_ret) +{ + struct bch_fs *c = trans->c; + struct write_point *wp; @@ -4813,7 +4919,7 @@ index 000000000000..e89999cf9238 + write_points_nr = c->write_points_nr; + have_cache = false; + -+ wp = writepoint_find(trans, write_point.v); ++ *wp_ret = wp = writepoint_find(trans, write_point.v); + + if (wp->data_type == BCH_DATA_user) + ob_flags |= BUCKET_MAY_ALLOC_PARTIAL; @@ -4870,7 +4976,7 @@ index 000000000000..e89999cf9238 + + BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX); + -+ return wp; ++ return 0; +err: + open_bucket_for_each(c, &wp->ptrs, ob, i) + if (ptrs.nr < ARRAY_SIZE(ptrs.v)) @@ -4888,39 +4994,13 @@ index 000000000000..e89999cf9238 + if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty) || + bch2_err_matches(ret, BCH_ERR_freelist_empty)) + return cl -+ ? ERR_PTR(-EAGAIN) -+ : ERR_PTR(-BCH_ERR_ENOSPC_bucket_alloc); ++ ? -EAGAIN ++ : -BCH_ERR_ENOSPC_bucket_alloc; + + if (bch2_err_matches(ret, BCH_ERR_insufficient_devices)) -+ return ERR_PTR(-EROFS); -+ -+ return ERR_PTR(ret); -+} -+ -+struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, -+ unsigned target, -+ unsigned erasure_code, -+ struct write_point_specifier write_point, -+ struct bch_devs_list *devs_have, -+ unsigned nr_replicas, -+ unsigned nr_replicas_required, -+ enum alloc_reserve reserve, -+ unsigned flags, -+ struct closure *cl) -+{ -+ struct write_point *wp; -+ -+ bch2_trans_do(c, NULL, NULL, 0, -+ PTR_ERR_OR_ZERO(wp = bch2_alloc_sectors_start_trans(&trans, target, -+ erasure_code, -+ write_point, -+ devs_have, -+ nr_replicas, -+ nr_replicas_required, -+ reserve, -+ flags, cl))); -+ return wp; ++ return -EROFS; + ++ return ret; +} + +struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob) @@ -4991,6 +5071,10 @@ index 000000000000..e89999cf9238 +{ + mutex_init(&wp->lock); + wp->data_type = type; ++ ++ INIT_WORK(&wp->index_update_work, bch2_write_point_do_index_updates); ++ 
INIT_LIST_HEAD(&wp->writes); ++ spin_lock_init(&wp->writes_lock); +} + +void bch2_fs_allocator_foreground_init(struct bch_fs *c) @@ -5021,7 +5105,7 @@ index 000000000000..e89999cf9238 + wp < c->write_points + c->write_points_nr; wp++) { + writepoint_init(wp, BCH_DATA_user); + -+ wp->last_used = sched_clock(); ++ wp->last_used = local_clock(); + wp->write_point = (unsigned long) wp; + hlist_add_head_rcu(&wp->node, + writepoint_hash(c, wp->write_point)); @@ -5048,10 +5132,10 @@ index 000000000000..e89999cf9238 +} diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h new file mode 100644 -index 000000000000..6de63a351fa8 +index 000000000000..16490ffbd2c7 --- /dev/null +++ b/fs/bcachefs/alloc_foreground.h -@@ -0,0 +1,181 @@ +@@ -0,0 +1,174 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ALLOC_FOREGROUND_H +#define _BCACHEFS_ALLOC_FOREGROUND_H @@ -5190,22 +5274,15 @@ index 000000000000..6de63a351fa8 + unsigned, unsigned *, bool *, enum alloc_reserve, + unsigned, struct closure *); + -+struct write_point *bch2_alloc_sectors_start_trans(struct btree_trans *, -+ unsigned, unsigned, -+ struct write_point_specifier, -+ struct bch_devs_list *, -+ unsigned, unsigned, -+ enum alloc_reserve, -+ unsigned, -+ struct closure *); -+struct write_point *bch2_alloc_sectors_start(struct bch_fs *, -+ unsigned, unsigned, -+ struct write_point_specifier, -+ struct bch_devs_list *, -+ unsigned, unsigned, -+ enum alloc_reserve, -+ unsigned, -+ struct closure *); ++int bch2_alloc_sectors_start_trans(struct btree_trans *, ++ unsigned, unsigned, ++ struct write_point_specifier, ++ struct bch_devs_list *, ++ unsigned, unsigned, ++ enum alloc_reserve, ++ unsigned, ++ struct closure *, ++ struct write_point **); + +struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *, struct open_bucket *); +void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *, @@ -5235,10 +5312,10 @@ index 000000000000..6de63a351fa8 +#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */ 
diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h new file mode 100644 -index 000000000000..e078584d46f6 +index 000000000000..3df98b22bb15 --- /dev/null +++ b/fs/bcachefs/alloc_types.h -@@ -0,0 +1,87 @@ +@@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ALLOC_TYPES_H +#define _BCACHEFS_ALLOC_TYPES_H @@ -5319,6 +5396,11 @@ index 000000000000..e078584d46f6 + + struct open_buckets ptrs; + struct dev_stripe_state stripe; ++ ++ struct work_struct index_update_work; ++ ++ struct list_head writes; ++ spinlock_t writes_lock; +}; + +struct write_point_specifier { @@ -5328,19 +5410,20 @@ index 000000000000..e078584d46f6 +#endif /* _BCACHEFS_ALLOC_TYPES_H */ diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c new file mode 100644 -index 000000000000..955f3ee96cc0 +index 000000000000..614811eafa59 --- /dev/null +++ b/fs/bcachefs/backpointers.c -@@ -0,0 +1,898 @@ +@@ -0,0 +1,1103 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" ++#include "bbpos.h" +#include "alloc_background.h" +#include "backpointers.h" +#include "btree_cache.h" +#include "btree_update.h" +#include "error.h" + -+#define MAX_EXTENT_COMPRESS_RATIO_SHIFT 10 ++#include + +/* + * Convert from pos in backpointer btree to pos of corresponding bucket in alloc @@ -5363,31 +5446,15 @@ index 000000000000..955f3ee96cc0 + u64 bucket_offset) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode); ++ struct bpos ret; + -+ return POS(bucket.inode, -+ (bucket_to_sector(ca, bucket.offset) << -+ MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset); -+} ++ ret = POS(bucket.inode, ++ (bucket_to_sector(ca, bucket.offset) << ++ MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset); + -+void bch2_extent_ptr_to_bp(struct bch_fs *c, -+ enum btree_id btree_id, unsigned level, -+ struct bkey_s_c k, struct extent_ptr_decoded p, -+ struct bpos *bucket_pos, struct bch_backpointer *bp) -+{ -+ enum bch_data_type data_type = level ? 
BCH_DATA_btree : BCH_DATA_user; -+ s64 sectors = level ? btree_sectors(c) : k.k->size; -+ u32 bucket_offset; ++ BUG_ON(bkey_cmp(bucket, bp_pos_to_bucket(c, ret))); + -+ *bucket_pos = PTR_BUCKET_POS_OFFSET(c, &p.ptr, &bucket_offset); -+ *bp = (struct bch_backpointer) { -+ .btree_id = btree_id, -+ .level = level, -+ .data_type = data_type, -+ .bucket_offset = ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + -+ p.crc.offset, -+ .bucket_len = ptr_disk_sectors(sectors, p), -+ .pos = k.k->p, -+ }; ++ return ret; +} + +static bool extent_matches_bp(struct bch_fs *c, @@ -5740,20 +5807,24 @@ index 000000000000..955f3ee96cc0 +int bch2_get_next_backpointer(struct btree_trans *trans, + struct bpos bucket, int gen, + u64 *bp_offset, -+ struct bch_backpointer *dst) ++ struct bch_backpointer *dst, ++ unsigned iter_flags) +{ + struct bch_fs *c = trans->c; -+ struct bpos bp_pos = -+ bucket_pos_to_bp(c, bucket, -+ max(*bp_offset, BACKPOINTER_OFFSET_MAX) - BACKPOINTER_OFFSET_MAX); -+ struct bpos bp_end_pos = -+ bucket_pos_to_bp(c, bpos_nosnap_successor(bucket), 0); ++ struct bpos bp_pos, bp_end_pos; + struct btree_iter alloc_iter, bp_iter = { NULL }; + struct bkey_s_c k; + struct bkey_s_c_alloc_v4 a; + size_t i; + int ret; + ++ if (*bp_offset == U64_MAX) ++ return 0; ++ ++ bp_pos = bucket_pos_to_bp(c, bucket, ++ max(*bp_offset, BACKPOINTER_OFFSET_MAX) - BACKPOINTER_OFFSET_MAX); ++ bp_end_pos = bucket_pos_to_bp(c, bpos_nosnap_successor(bucket), 0); ++ + bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, + bucket, BTREE_ITER_CACHED); + k = bch2_btree_iter_peek_slot(&alloc_iter); @@ -5857,7 +5928,7 @@ index 000000000000..955f3ee96cc0 + if (bp.level == c->btree_roots[bp.btree_id].level + 1) + k = bkey_i_to_s_c(&c->btree_roots[bp.btree_id].key); + -+ if (extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp)) ++ if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp)) + return k; + + bch2_trans_iter_exit(trans, iter); @@ -5907,12 +5978,12 @@ index 
000000000000..955f3ee96cc0 + if (IS_ERR(b)) + goto err; + -+ if (extent_matches_bp(c, bp.btree_id, bp.level, -+ bkey_i_to_s_c(&b->key), -+ bucket, bp)) ++ if (b && extent_matches_bp(c, bp.btree_id, bp.level, ++ bkey_i_to_s_c(&b->key), ++ bucket, bp)) + return b; + -+ if (btree_node_will_make_reachable(b)) { ++ if (b && btree_node_will_make_reachable(b)) { + b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node); + } else { + backpointer_not_found(trans, bucket, bp_offset, bp, @@ -5981,7 +6052,9 @@ index 000000000000..955f3ee96cc0 +static int check_bp_exists(struct btree_trans *trans, + struct bpos bucket_pos, + struct bch_backpointer bp, -+ struct bkey_s_c orig_k) ++ struct bkey_s_c orig_k, ++ struct bpos bucket_start, ++ struct bpos bucket_end) +{ + struct bch_fs *c = trans->c; + struct btree_iter alloc_iter, bp_iter = { NULL }; @@ -5989,6 +6062,10 @@ index 000000000000..955f3ee96cc0 + struct bkey_s_c alloc_k, bp_k; + int ret; + ++ if (bpos_cmp(bucket_pos, bucket_start) < 0 || ++ bpos_cmp(bucket_pos, bucket_end) > 0) ++ return 0; ++ + bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, bucket_pos, 0); + alloc_k = bch2_btree_iter_peek_slot(&alloc_iter); + ret = bkey_err(alloc_k); @@ -6051,7 +6128,9 @@ index 000000000000..955f3ee96cc0 +} + +static int check_extent_to_backpointers(struct btree_trans *trans, -+ struct btree_iter *iter) ++ struct btree_iter *iter, ++ struct bpos bucket_start, ++ struct bpos bucket_end) +{ + struct bch_fs *c = trans->c; + struct bkey_ptrs_c ptrs; @@ -6078,7 +6157,7 @@ index 000000000000..955f3ee96cc0 + bch2_extent_ptr_to_bp(c, iter->btree_id, iter->path->level, + k, p, &bucket_pos, &bp); + -+ ret = check_bp_exists(trans, bucket_pos, bp, k); ++ ret = check_bp_exists(trans, bucket_pos, bp, k, bucket_start, bucket_end); + if (ret) + return ret; + } @@ -6087,7 +6166,9 @@ index 000000000000..955f3ee96cc0 +} + +static int check_btree_root_to_backpointers(struct btree_trans *trans, -+ enum btree_id btree_id) ++ enum btree_id 
btree_id, ++ struct bpos bucket_start, ++ struct bpos bucket_end) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; @@ -6119,7 +6200,7 @@ index 000000000000..955f3ee96cc0 + bch2_extent_ptr_to_bp(c, iter.btree_id, iter.path->level + 1, + k, p, &bucket_pos, &bp); + -+ ret = check_bp_exists(trans, bucket_pos, bp, k); ++ ret = check_bp_exists(trans, bucket_pos, bp, k, bucket_start, bucket_end); + if (ret) + goto err; + } @@ -6128,60 +6209,222 @@ index 000000000000..955f3ee96cc0 + return ret; +} + -+int bch2_check_extents_to_backpointers(struct bch_fs *c) ++static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp) ++{ ++ return (struct bbpos) { ++ .btree = bp.btree_id, ++ .pos = bp.pos, ++ }; ++} ++ ++static size_t btree_nodes_fit_in_ram(struct bch_fs *c) ++{ ++ struct sysinfo i; ++ u64 mem_bytes; ++ ++ si_meminfo(&i); ++ mem_bytes = i.totalram * i.mem_unit; ++ return (mem_bytes >> 1) / btree_bytes(c); ++} ++ ++int bch2_get_btree_in_memory_pos(struct btree_trans *trans, ++ unsigned btree_leaf_mask, ++ unsigned btree_interior_mask, ++ struct bbpos start, struct bbpos *end) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ size_t btree_nodes = btree_nodes_fit_in_ram(trans->c); ++ enum btree_id btree; ++ int ret = 0; ++ ++ for (btree = start.btree; btree < BTREE_ID_NR && !ret; btree++) { ++ unsigned depth = ((1U << btree) & btree_leaf_mask) ? 1 : 2; ++ ++ if (!((1U << btree) & btree_leaf_mask) && ++ !((1U << btree) & btree_interior_mask)) ++ continue; ++ ++ bch2_trans_node_iter_init(trans, &iter, btree, ++ btree == start.btree ? 
start.pos : POS_MIN, ++ 0, depth, 0); ++ /* ++ * for_each_btree_key_contineu() doesn't check the return value ++ * from bch2_btree_iter_advance(), which is needed when ++ * iterating over interior nodes where we'll see keys at ++ * SPOS_MAX: ++ */ ++ do { ++ k = __bch2_btree_iter_peek_and_restart(trans, &iter, 0); ++ ret = bkey_err(k); ++ if (!k.k || ret) ++ break; ++ ++ --btree_nodes; ++ if (!btree_nodes) { ++ *end = BBPOS(btree, k.k->p); ++ bch2_trans_iter_exit(trans, &iter); ++ return 0; ++ } ++ } while (bch2_btree_iter_advance(&iter)); ++ bch2_trans_iter_exit(trans, &iter); ++ } ++ ++ *end = BBPOS_MAX; ++ return ret; ++} ++ ++static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, ++ struct bpos bucket_start, ++ struct bpos bucket_end) +{ -+ struct btree_trans trans; + struct btree_iter iter; + enum btree_id btree_id; + int ret = 0; + -+ bch2_trans_init(&trans, c, 0, 0); + for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { -+ bch2_trans_node_iter_init(&trans, &iter, btree_id, POS_MIN, 0, -+ 0, ++ unsigned depth = btree_type_has_ptrs(btree_id) ? 
0 : 1; ++ ++ bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, ++ depth, + BTREE_ITER_ALL_LEVELS| + BTREE_ITER_PREFETCH); + + do { -+ ret = commit_do(&trans, NULL, NULL, -+ BTREE_INSERT_LAZY_RW| -+ BTREE_INSERT_NOFAIL, -+ check_extent_to_backpointers(&trans, &iter)); ++ ret = commit_do(trans, NULL, NULL, ++ BTREE_INSERT_LAZY_RW| ++ BTREE_INSERT_NOFAIL, ++ check_extent_to_backpointers(trans, &iter, ++ bucket_start, bucket_end)); + if (ret) + break; + } while (!bch2_btree_iter_advance(&iter)); + -+ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_iter_exit(trans, &iter); + + if (ret) + break; + -+ ret = commit_do(&trans, NULL, NULL, -+ BTREE_INSERT_LAZY_RW| -+ BTREE_INSERT_NOFAIL, -+ check_btree_root_to_backpointers(&trans, btree_id)); ++ ret = commit_do(trans, NULL, NULL, ++ BTREE_INSERT_LAZY_RW| ++ BTREE_INSERT_NOFAIL, ++ check_btree_root_to_backpointers(trans, btree_id, ++ bucket_start, bucket_end)); + if (ret) + break; + } ++ return ret; ++} ++ ++int bch2_get_alloc_in_memory_pos(struct btree_trans *trans, ++ struct bpos start, struct bpos *end) ++{ ++ struct btree_iter alloc_iter; ++ struct btree_iter bp_iter; ++ struct bkey_s_c alloc_k, bp_k; ++ size_t btree_nodes = btree_nodes_fit_in_ram(trans->c); ++ bool alloc_end = false, bp_end = false; ++ int ret = 0; ++ ++ bch2_trans_node_iter_init(trans, &alloc_iter, BTREE_ID_alloc, ++ start, 0, 1, 0); ++ bch2_trans_node_iter_init(trans, &bp_iter, BTREE_ID_backpointers, ++ bucket_pos_to_bp(trans->c, start, 0), 0, 1, 0); ++ while (1) { ++ alloc_k = !alloc_end ++ ? __bch2_btree_iter_peek_and_restart(trans, &alloc_iter, 0) ++ : bkey_s_c_null; ++ bp_k = !bp_end ++ ? 
__bch2_btree_iter_peek_and_restart(trans, &bp_iter, 0) ++ : bkey_s_c_null; ++ ++ ret = bkey_err(alloc_k) ?: bkey_err(bp_k); ++ if ((!alloc_k.k && !bp_k.k) || ret) { ++ *end = SPOS_MAX; ++ break; ++ } ++ ++ --btree_nodes; ++ if (!btree_nodes) { ++ *end = alloc_k.k->p; ++ break; ++ } ++ ++ if (bpos_cmp(alloc_iter.pos, SPOS_MAX) && ++ bpos_cmp(bucket_pos_to_bp(trans->c, alloc_iter.pos, 0), bp_iter.pos) < 0) { ++ if (!bch2_btree_iter_advance(&alloc_iter)) ++ alloc_end = true; ++ } else { ++ if (!bch2_btree_iter_advance(&bp_iter)) ++ bp_end = true; ++ } ++ } ++ bch2_trans_iter_exit(trans, &bp_iter); ++ bch2_trans_iter_exit(trans, &alloc_iter); ++ return ret; ++} ++ ++int bch2_check_extents_to_backpointers(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct bpos start = POS_MIN, end; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ while (1) { ++ ret = bch2_get_alloc_in_memory_pos(&trans, start, &end); ++ if (ret) ++ break; ++ ++ if (!bpos_cmp(start, POS_MIN) && bpos_cmp(end, SPOS_MAX)) ++ bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass", ++ __func__, btree_nodes_fit_in_ram(c)); ++ ++ if (bpos_cmp(start, POS_MIN) || bpos_cmp(end, SPOS_MAX)) { ++ struct printbuf buf = PRINTBUF; ++ ++ prt_str(&buf, "check_extents_to_backpointers(): "); ++ bch2_bpos_to_text(&buf, start); ++ prt_str(&buf, "-"); ++ bch2_bpos_to_text(&buf, end); ++ ++ bch_verbose(c, "%s", buf.buf); ++ printbuf_exit(&buf); ++ } ++ ++ ret = bch2_check_extents_to_backpointers_pass(&trans, start, end); ++ if (ret || !bpos_cmp(end, SPOS_MAX)) ++ break; ++ ++ start = bpos_successor(end); ++ } + bch2_trans_exit(&trans); ++ + return ret; +} + +static int check_one_backpointer(struct btree_trans *trans, + struct bpos bucket, -+ u64 *bp_offset) ++ u64 *bp_offset, ++ struct bbpos start, ++ struct bbpos end) +{ + struct btree_iter iter; + struct bch_backpointer bp; ++ struct bbpos pos; + struct bkey_s_c k; + struct printbuf buf = PRINTBUF; + int ret; 
+ -+ ret = bch2_get_next_backpointer(trans, bucket, -1, -+ bp_offset, &bp); ++ ret = bch2_get_next_backpointer(trans, bucket, -1, bp_offset, &bp, 0); + if (ret || *bp_offset == U64_MAX) + return ret; + ++ pos = bp_to_bbpos(bp); ++ if (bbpos_cmp(pos, start) < 0 || ++ bbpos_cmp(pos, end) > 0) ++ return 0; ++ + k = bch2_backpointer_get_key(trans, &iter, bucket, *bp_offset, bp); + ret = bkey_err(k); + if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) @@ -6204,42 +6447,87 @@ index 000000000000..955f3ee96cc0 + return ret; +} + -+int bch2_check_backpointers_to_extents(struct bch_fs *c) ++static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, ++ struct bbpos start, ++ struct bbpos end) +{ -+ struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + -+ bch2_trans_init(&trans, c, 0, 0); -+ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, ++ for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + u64 bp_offset = 0; + -+ while (!(ret = commit_do(&trans, NULL, NULL, -+ BTREE_INSERT_LAZY_RW| -+ BTREE_INSERT_NOFAIL, -+ check_one_backpointer(&trans, iter.pos, &bp_offset))) && ++ while (!(ret = commit_do(trans, NULL, NULL, ++ BTREE_INSERT_LAZY_RW| ++ BTREE_INSERT_NOFAIL, ++ check_one_backpointer(trans, iter.pos, &bp_offset, start, end))) && + bp_offset < U64_MAX) + bp_offset++; + + if (ret) + break; + } -+ bch2_trans_iter_exit(&trans, &iter); -+ bch2_trans_exit(&trans); ++ bch2_trans_iter_exit(trans, &iter); + return ret < 0 ? 
ret : 0; +} ++ ++int bch2_check_backpointers_to_extents(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct bbpos start = (struct bbpos) { .btree = 0, .pos = POS_MIN, }, end; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ while (1) { ++ ret = bch2_get_btree_in_memory_pos(&trans, ++ (1U << BTREE_ID_extents)| ++ (1U << BTREE_ID_reflink), ++ ~0, ++ start, &end); ++ if (ret) ++ break; ++ ++ if (!bbpos_cmp(start, BBPOS_MIN) && ++ bbpos_cmp(end, BBPOS_MAX)) ++ bch_verbose(c, "%s(): extents do not fit in ram, running in multiple passes with %zu nodes per pass", ++ __func__, btree_nodes_fit_in_ram(c)); ++ ++ if (bbpos_cmp(start, BBPOS_MIN) || ++ bbpos_cmp(end, BBPOS_MAX)) { ++ struct printbuf buf = PRINTBUF; ++ ++ prt_str(&buf, "check_backpointers_to_extents(): "); ++ bch2_bbpos_to_text(&buf, start); ++ prt_str(&buf, "-"); ++ bch2_bbpos_to_text(&buf, end); ++ ++ bch_verbose(c, "%s", buf.buf); ++ printbuf_exit(&buf); ++ } ++ ++ ret = bch2_check_backpointers_to_extents_pass(&trans, start, end); ++ if (ret || !bbpos_cmp(end, BBPOS_MAX)) ++ break; ++ ++ start = bbpos_successor(end); ++ } ++ bch2_trans_exit(&trans); ++ ++ return ret; ++} diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h new file mode 100644 -index 000000000000..fe42af296e9c +index 000000000000..48a48b75c0ac --- /dev/null +++ b/fs/bcachefs/backpointers.h -@@ -0,0 +1,38 @@ +@@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H +#define _BCACHEFS_BACKPOINTERS_BACKGROUND_H + ++#include "buckets.h" +#include "super.h" + +int bch2_backpointer_invalid(const struct bch_fs *, struct bkey_s_c k, @@ -6248,22 +6536,41 @@ index 000000000000..fe42af296e9c +void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +void bch2_backpointer_swab(struct bkey_s); + -+#define bch2_bkey_ops_backpointer (struct bkey_ops) { \ ++#define bch2_bkey_ops_backpointer ((struct bkey_ops) { \ + .key_invalid = bch2_backpointer_invalid, 
\ + .val_to_text = bch2_backpointer_k_to_text, \ + .swab = bch2_backpointer_swab, \ -+} ++}) + -+void bch2_extent_ptr_to_bp(struct bch_fs *, enum btree_id, unsigned, -+ struct bkey_s_c, struct extent_ptr_decoded, -+ struct bpos *, struct bch_backpointer *); ++#define MAX_EXTENT_COMPRESS_RATIO_SHIFT 10 ++ ++static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c k, struct extent_ptr_decoded p, ++ struct bpos *bucket_pos, struct bch_backpointer *bp) ++{ ++ enum bch_data_type data_type = level ? BCH_DATA_btree : BCH_DATA_user; ++ s64 sectors = level ? btree_sectors(c) : k.k->size; ++ u32 bucket_offset; ++ ++ *bucket_pos = PTR_BUCKET_POS_OFFSET(c, &p.ptr, &bucket_offset); ++ *bp = (struct bch_backpointer) { ++ .btree_id = btree_id, ++ .level = level, ++ .data_type = data_type, ++ .bucket_offset = ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + ++ p.crc.offset, ++ .bucket_len = ptr_disk_sectors(sectors, p), ++ .pos = k.k->p, ++ }; ++} + +int bch2_bucket_backpointer_del(struct btree_trans *, struct bkey_i_alloc_v4 *, + struct bch_backpointer, struct bkey_s_c); +int bch2_bucket_backpointer_add(struct btree_trans *, struct bkey_i_alloc_v4 *, + struct bch_backpointer, struct bkey_s_c); +int bch2_get_next_backpointer(struct btree_trans *, struct bpos, int, -+ u64 *, struct bch_backpointer *); ++ u64 *, struct bch_backpointer *, unsigned); +struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct btree_iter *, + struct bpos, u64, struct bch_backpointer); +struct btree *bch2_backpointer_get_node(struct btree_trans *, struct btree_iter *, @@ -6274,12 +6581,66 @@ index 000000000000..fe42af296e9c +int bch2_check_backpointers_to_extents(struct bch_fs *); + +#endif /* _BCACHEFS_BACKPOINTERS_BACKGROUND_H */ +diff --git a/fs/bcachefs/bbpos.h b/fs/bcachefs/bbpos.h +new file mode 100644 +index 000000000000..1fbed1f8378d +--- /dev/null ++++ b/fs/bcachefs/bbpos.h +@@ -0,0 +1,48 @@ ++/* 
SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BBPOS_H ++#define _BCACHEFS_BBPOS_H ++ ++#include "bkey_methods.h" ++ ++struct bbpos { ++ enum btree_id btree; ++ struct bpos pos; ++}; ++ ++static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos) ++{ ++ return (struct bbpos) { btree, pos }; ++} ++ ++#define BBPOS_MIN BBPOS(0, POS_MIN) ++#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, POS_MAX) ++ ++static inline int bbpos_cmp(struct bbpos l, struct bbpos r) ++{ ++ return cmp_int(l.btree, r.btree) ?: bpos_cmp(l.pos, r.pos); ++} ++ ++static inline struct bbpos bbpos_successor(struct bbpos pos) ++{ ++ if (bpos_cmp(pos.pos, SPOS_MAX)) { ++ pos.pos = bpos_successor(pos.pos); ++ return pos; ++ } ++ ++ if (pos.btree != BTREE_ID_NR) { ++ pos.btree++; ++ pos.pos = POS_MIN; ++ return pos; ++ } ++ ++ BUG(); ++} ++ ++static inline void bch2_bbpos_to_text(struct printbuf *out, struct bbpos pos) ++{ ++ prt_str(out, bch2_btree_ids[pos.btree]); ++ prt_char(out, ':'); ++ bch2_bpos_to_text(out, pos.pos); ++} ++ ++#endif /* _BCACHEFS_BBPOS_H */ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h new file mode 100644 -index 000000000000..ccac2a3fcdf7 +index 000000000000..d90effeb06a7 --- /dev/null +++ b/fs/bcachefs/bcachefs.h -@@ -0,0 +1,1001 @@ +@@ -0,0 +1,1019 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_H +#define _BCACHEFS_H @@ -6389,7 +6750,7 @@ index 000000000000..ccac2a3fcdf7 + * + * BTREE NODES: + * -+ * Our unit of allocation is a bucket, and we we can't arbitrarily allocate and ++ * Our unit of allocation is a bucket, and we can't arbitrarily allocate and + * free smaller than a bucket - so, that's how big our btree nodes are. 
+ * + * (If buckets are really big we'll only use part of the bucket for a btree node @@ -6564,7 +6925,7 @@ index 000000000000..ccac2a3fcdf7 + "When reading btree nodes, read all replicas and " \ + "compare them") + -+/* Parameters that should only be compiled in in debug mode: */ ++/* Parameters that should only be compiled in debug mode: */ +#define BCH_DEBUG_PARAMS_DEBUG() \ + BCH_DEBUG_PARAM(expensive_debug_checks, \ + "Enables various runtime debugging checks that " \ @@ -6880,6 +7241,23 @@ index 000000000000..ccac2a3fcdf7 +#define BCACHEFS_ROOT_SUBVOL_INUM \ + ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) + ++#define BCH_BTREE_WRITE_TYPES() \ ++ x(initial, 0) \ ++ x(init_next_bset, 1) \ ++ x(cache_reclaim, 2) \ ++ x(journal_reclaim, 3) \ ++ x(interior, 4) ++ ++enum btree_write_type { ++#define x(t, n) BTREE_WRITE_##t, ++ BCH_BTREE_WRITE_TYPES() ++#undef x ++ BTREE_WRITE_TYPE_NR, ++}; ++ ++#define BTREE_WRITE_TYPE_MASK (roundup_pow_of_two(BTREE_WRITE_TYPE_NR) - 1) ++#define BTREE_WRITE_TYPE_BITS ilog2(BTREE_WRITE_TYPE_MASK) ++ +struct bch_fs { + struct closure cl; + @@ -6989,6 +7367,13 @@ index 000000000000..ccac2a3fcdf7 + struct workqueue_struct *btree_interior_update_worker; + struct work_struct btree_interior_update_work; + ++ /* btree_io.c: */ ++ spinlock_t btree_write_error_lock; ++ struct btree_write_stats { ++ atomic64_t nr; ++ atomic64_t bytes; ++ } btree_write_stats[BTREE_WRITE_TYPE_NR]; ++ + /* btree_iter.c: */ + struct mutex btree_trans_lock; + struct list_head btree_trans_list; @@ -7163,11 +7548,6 @@ index 000000000000..ccac2a3fcdf7 + struct bio_set dio_write_bioset; + struct bio_set dio_read_bioset; + -+ -+ atomic64_t btree_writes_nr; -+ atomic64_t btree_writes_sectors; -+ spinlock_t btree_write_error_lock; -+ + /* ERRORS */ + struct list_head fsck_errors; + struct mutex fsck_error_lock; @@ -7212,7 +7592,6 @@ index 000000000000..ccac2a3fcdf7 + + struct time_stats times[BCH_TIME_STAT_NR]; + -+ const char 
*btree_transaction_fns[BCH_TRANSACTIONS_NR]; + struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR]; +}; + @@ -7283,10 +7662,10 @@ index 000000000000..ccac2a3fcdf7 +#endif /* _BCACHEFS_H */ diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h new file mode 100644 -index 000000000000..9e10fc8301f0 +index 000000000000..5da9f3a4d47d --- /dev/null +++ b/fs/bcachefs/bcachefs_format.h -@@ -0,0 +1,2122 @@ +@@ -0,0 +1,2172 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FORMAT_H +#define _BCACHEFS_FORMAT_H @@ -7436,7 +7815,7 @@ index 000000000000..9e10fc8301f0 +#else +#error edit for your odd byteorder. +#endif -+} __attribute__((packed, aligned(4))); ++} __packed __aligned(4); + +#define KEY_INODE_MAX ((__u64)~0ULL) +#define KEY_OFFSET_MAX ((__u64)~0ULL) @@ -7470,7 +7849,7 @@ index 000000000000..9e10fc8301f0 + __u32 hi; + __u64 lo; +#endif -+} __attribute__((packed, aligned(4))); ++} __packed __aligned(4); + +struct bkey { + /* Size of combined key and value, in u64s */ @@ -7503,7 +7882,7 @@ index 000000000000..9e10fc8301f0 + + __u8 pad[1]; +#endif -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +struct bkey_packed { + __u64 _data[0]; @@ -7537,7 +7916,7 @@ index 000000000000..9e10fc8301f0 + * to the same size as struct bkey should hopefully be safest. + */ + __u8 pad[sizeof(struct bkey) - 3]; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +#define BKEY_U64s (sizeof(struct bkey) / sizeof(__u64)) +#define BKEY_U64s_MAX U8_MAX @@ -7625,7 +8004,7 @@ index 000000000000..9e10fc8301f0 + * number. 
+ * + * - WHITEOUT: for hash table btrees -+*/ ++ */ +#define BCH_BKEY_TYPES() \ + x(deleted, 0) \ + x(whiteout, 1) \ @@ -7655,7 +8034,8 @@ index 000000000000..9e10fc8301f0 + x(set, 25) \ + x(lru, 26) \ + x(alloc_v4, 27) \ -+ x(backpointer, 28) ++ x(backpointer, 28) \ ++ x(inode_v3, 29) + +enum bch_bkey_type { +#define x(name, nr) KEY_TYPE_##name = nr, @@ -7766,7 +8146,7 @@ index 000000000000..9e10fc8301f0 +struct bch_csum { + __le64 lo; + __le64 hi; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +#define BCH_EXTENT_ENTRY_TYPES() \ + x(ptr, 0) \ @@ -7803,7 +8183,7 @@ index 000000000000..9e10fc8301f0 + _compressed_size:7, + type:2; +#endif -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +#define CRC32_SIZE_MAX (1U << 7) +#define CRC32_NONCE_MAX 0 @@ -7829,7 +8209,7 @@ index 000000000000..9e10fc8301f0 + type:3; +#endif + __u64 csum_lo; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +#define CRC64_SIZE_MAX (1U << 9) +#define CRC64_NONCE_MAX ((1U << 10) - 1) @@ -7853,7 +8233,7 @@ index 000000000000..9e10fc8301f0 + type:4; +#endif + struct bch_csum csum; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +#define CRC128_SIZE_MAX (1U << 13) +#define CRC128_NONCE_MAX ((1U << 13) - 1) @@ -7879,7 +8259,7 @@ index 000000000000..9e10fc8301f0 + cached:1, + type:1; +#endif -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +struct bch_extent_stripe_ptr { +#if defined(__LITTLE_ENDIAN_BITFIELD) @@ -7931,7 +8311,7 @@ index 000000000000..9e10fc8301f0 + + __u64 _data[0]; + struct bch_extent_ptr start[]; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +struct bch_btree_ptr_v2 { + struct bch_val v; @@ -7943,7 +8323,7 @@ index 000000000000..9e10fc8301f0 + struct bpos min_key; + __u64 _data[0]; + struct bch_extent_ptr start[]; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, 
flags, 0, 1); + @@ -7952,7 +8332,7 @@ index 000000000000..9e10fc8301f0 + + __u64 _data[0]; + union bch_extent_entry start[]; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +struct bch_reservation { + struct bch_val v; @@ -7960,7 +8340,7 @@ index 000000000000..9e10fc8301f0 + __le32 generation; + __u8 nr_replicas; + __u8 pad[3]; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +/* Maximum size (in u64s) a single pointer could be: */ +#define BKEY_EXTENT_PTR_U64s_MAX\ @@ -7994,7 +8374,7 @@ index 000000000000..9e10fc8301f0 + __le32 bi_flags; + __le16 bi_mode; + __u8 fields[0]; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +struct bch_inode_v2 { + struct bch_val v; @@ -8004,20 +8384,35 @@ index 000000000000..9e10fc8301f0 + __le64 bi_flags; + __le16 bi_mode; + __u8 fields[0]; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); ++ ++struct bch_inode_v3 { ++ struct bch_val v; ++ ++ __le64 bi_journal_seq; ++ __le64 bi_hash_seed; ++ __le64 bi_flags; ++ __le64 bi_sectors; ++ __le64 bi_size; ++ __le64 bi_version; ++ __u8 fields[0]; ++} __packed __aligned(8); ++ ++#define INODEv3_FIELDS_START_INITIAL 6 ++#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(u64)) + +struct bch_inode_generation { + struct bch_val v; + + __le32 bi_generation; + __le32 pad; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +/* + * bi_subvol and bi_parent_subvol are only set for subvolume roots: + */ + -+#define BCH_INODE_FIELDS() \ ++#define BCH_INODE_FIELDS_v2() \ + x(bi_atime, 96) \ + x(bi_ctime, 96) \ + x(bi_mtime, 96) \ @@ -8044,6 +8439,31 @@ index 000000000000..9e10fc8301f0 + x(bi_subvol, 32) \ + x(bi_parent_subvol, 32) + ++#define BCH_INODE_FIELDS_v3() \ ++ x(bi_atime, 96) \ ++ x(bi_ctime, 96) \ ++ x(bi_mtime, 96) \ ++ x(bi_otime, 96) \ ++ x(bi_uid, 32) \ ++ x(bi_gid, 32) \ ++ x(bi_nlink, 32) \ ++ x(bi_generation, 32) \ ++ x(bi_dev, 32) \ ++ x(bi_data_checksum, 
8) \ ++ x(bi_compression, 8) \ ++ x(bi_project, 32) \ ++ x(bi_background_compression, 8) \ ++ x(bi_data_replicas, 8) \ ++ x(bi_promote_target, 16) \ ++ x(bi_foreground_target, 16) \ ++ x(bi_background_target, 16) \ ++ x(bi_erasure_code, 16) \ ++ x(bi_fields_set, 16) \ ++ x(bi_dir, 64) \ ++ x(bi_dir_offset, 64) \ ++ x(bi_subvol, 32) \ ++ x(bi_parent_subvol, 32) ++ +/* subset of BCH_INODE_FIELDS */ +#define BCH_INODE_OPTS() \ + x(data_checksum, 8) \ @@ -8069,16 +8489,16 @@ index 000000000000..9e10fc8301f0 + * User flags (get/settable with FS_IOC_*FLAGS, correspond to FS_*_FL + * flags) + */ -+ __BCH_INODE_SYNC = 0, -+ __BCH_INODE_IMMUTABLE = 1, -+ __BCH_INODE_APPEND = 2, -+ __BCH_INODE_NODUMP = 3, -+ __BCH_INODE_NOATIME = 4, ++ __BCH_INODE_SYNC = 0, ++ __BCH_INODE_IMMUTABLE = 1, ++ __BCH_INODE_APPEND = 2, ++ __BCH_INODE_NODUMP = 3, ++ __BCH_INODE_NOATIME = 4, + -+ __BCH_INODE_I_SIZE_DIRTY= 5, -+ __BCH_INODE_I_SECTORS_DIRTY= 6, -+ __BCH_INODE_UNLINKED = 7, -+ __BCH_INODE_BACKPTR_UNTRUSTED = 8, ++ __BCH_INODE_I_SIZE_DIRTY = 5, ++ __BCH_INODE_I_SECTORS_DIRTY = 6, ++ __BCH_INODE_UNLINKED = 7, ++ __BCH_INODE_BACKPTR_UNTRUSTED = 8, + + /* bits 20+ reserved for packed fields below: */ +}; @@ -8100,6 +8520,13 @@ index 000000000000..9e10fc8301f0 +LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24); +LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31); + ++LE64_BITMASK(INODEv3_STR_HASH, struct bch_inode_v3, bi_flags, 20, 24); ++LE64_BITMASK(INODEv3_NR_FIELDS, struct bch_inode_v3, bi_flags, 24, 31); ++ ++LE64_BITMASK(INODEv3_FIELDS_START, ++ struct bch_inode_v3, bi_flags, 31, 36); ++LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52); ++ +/* Dirents */ + +/* @@ -8132,7 +8559,7 @@ index 000000000000..9e10fc8301f0 + __u8 d_type; + + __u8 d_name[]; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +#define DT_SUBVOL 16 +#define BCH_DT_MAX 17 @@ -8155,7 +8582,7 @@ index 000000000000..9e10fc8301f0 + __u8 
x_name_len; + __le16 x_val_len; + __u8 x_name[]; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +/* Bucket/allocation information: */ + @@ -8164,7 +8591,7 @@ index 000000000000..9e10fc8301f0 + __u8 fields; + __u8 gen; + __u8 data[]; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +#define BCH_ALLOC_FIELDS_V1() \ + x(read_time, 16) \ @@ -8189,7 +8616,7 @@ index 000000000000..9e10fc8301f0 + __u8 oldest_gen; + __u8 data_type; + __u8 data[]; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +#define BCH_ALLOC_FIELDS_V2() \ + x(read_time, 64) \ @@ -8208,7 +8635,7 @@ index 000000000000..9e10fc8301f0 + __u8 oldest_gen; + __u8 data_type; + __u8 data[]; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1) +LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2) @@ -8226,7 +8653,7 @@ index 000000000000..9e10fc8301f0 + __u64 io_time[2]; + __u32 stripe; + __u32 nr_external_backpointers; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +#define BCH_ALLOC_V4_U64s_V0 6 +#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(u64)) @@ -8246,7 +8673,7 @@ index 000000000000..9e10fc8301f0 + __u64 bucket_offset:40; + __u32 bucket_len; + struct bpos pos; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +/* Quotas: */ + @@ -8271,7 +8698,7 @@ index 000000000000..9e10fc8301f0 +struct bch_quota { + struct bch_val v; + struct bch_quota_counter c[Q_COUNTERS]; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +/* Erasure coding */ + @@ -8287,7 +8714,7 @@ index 000000000000..9e10fc8301f0 + __u8 pad; + + struct bch_extent_ptr ptrs[]; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +/* Reflink: */ + @@ -8304,14 +8731,14 @@ index 000000000000..9e10fc8301f0 + */ + __le32 front_pad; + __le32 back_pad; -+} __attribute__((packed, 
aligned(8))); ++} __packed __aligned(8); + +struct bch_reflink_v { + struct bch_val v; + __le64 refcount; + union bch_extent_entry start[0]; + __u64 _data[0]; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +struct bch_indirect_inline_data { + struct bch_val v; @@ -8368,7 +8795,7 @@ index 000000000000..9e10fc8301f0 +struct bch_lru { + struct bch_val v; + __le64 idx; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +#define LRU_ID_STRIPES (1U << 16) + @@ -8567,19 +8994,19 @@ index 000000000000..9e10fc8301f0 + __u8 data_type; + __u8 nr_devs; + __u8 devs[]; -+} __attribute__((packed)); ++} __packed; + +struct bch_sb_field_replicas_v0 { + struct bch_sb_field field; + struct bch_replicas_entry_v0 entries[]; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +struct bch_replicas_entry { + __u8 data_type; + __u8 nr_devs; + __u8 nr_required; + __u8 devs[]; -+} __attribute__((packed)); ++} __packed; + +#define replicas_entry_bytes(_i) \ + (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs) @@ -8587,7 +9014,7 @@ index 000000000000..9e10fc8301f0 +struct bch_sb_field_replicas { + struct bch_sb_field field; + struct bch_replicas_entry entries[0]; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +/* BCH_SB_FIELD_quota: */ + @@ -8604,7 +9031,7 @@ index 000000000000..9e10fc8301f0 +struct bch_sb_field_quota { + struct bch_sb_field field; + struct bch_sb_quota_type q[QTYP_NR]; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +/* BCH_SB_FIELD_disk_groups: */ + @@ -8613,7 +9040,7 @@ index 000000000000..9e10fc8301f0 +struct bch_disk_group { + __u8 label[BCH_SB_LABEL_SIZE]; + __le64 flags[2]; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1) +LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6) @@ -8622,7 +9049,7 @@ index 000000000000..9e10fc8301f0 +struct 
bch_sb_field_disk_groups { + struct bch_sb_field field; + struct bch_disk_group entries[0]; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +/* BCH_SB_FIELD_counters */ + @@ -8783,7 +9210,8 @@ index 000000000000..9e10fc8301f0 + x(freespace, 19) \ + x(alloc_v4, 20) \ + x(new_data_types, 21) \ -+ x(backpointers, 22) ++ x(backpointers, 22) \ ++ x(inode_v3, 23) + +enum bcachefs_metadata_version { + bcachefs_metadata_version_min = 9, @@ -8805,7 +9233,7 @@ index 000000000000..9e10fc8301f0 + __u8 nr_superblocks; + __u8 pad[5]; + __le64 sb_offset[61]; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +#define BCH_SB_LAYOUT_SECTOR 7 + @@ -8856,7 +9284,7 @@ index 000000000000..9e10fc8301f0 + struct bch_sb_field start[0]; + __le64 _data[0]; + }; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +/* + * Flags: @@ -9139,6 +9567,7 @@ index 000000000000..9e10fc8301f0 +static inline __le64 __bch2_sb_magic(struct bch_sb *sb) +{ + __le64 ret; ++ + memcpy(&ret, &sb->uuid, sizeof(ret)); + return ret; +} @@ -9213,26 +9642,26 @@ index 000000000000..9e10fc8301f0 +struct jset_entry_usage { + struct jset_entry entry; + __le64 v; -+} __attribute__((packed)); ++} __packed; + +struct jset_entry_data_usage { + struct jset_entry entry; + __le64 v; + struct bch_replicas_entry r; -+} __attribute__((packed)); ++} __packed; + +struct jset_entry_clock { + struct jset_entry entry; + __u8 rw; + __u8 pad[7]; + __le64 time; -+} __attribute__((packed)); ++} __packed; + +struct jset_entry_dev_usage_type { + __le64 buckets; + __le64 sectors; + __le64 fragmented; -+} __attribute__((packed)); ++} __packed; + +struct jset_entry_dev_usage { + struct jset_entry entry; @@ -9243,7 +9672,7 @@ index 000000000000..9e10fc8301f0 + __le64 _buckets_unavailable; /* No longer used */ + + struct jset_entry_dev_usage_type d[]; -+} __attribute__((packed)); ++} __packed; + +static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage *u) +{ @@ 
-9254,7 +9683,7 @@ index 000000000000..9e10fc8301f0 +struct jset_entry_log { + struct jset_entry entry; + u8 d[]; -+} __attribute__((packed)); ++} __packed; + +/* + * On disk format for a journal entry: @@ -9289,7 +9718,7 @@ index 000000000000..9e10fc8301f0 + struct jset_entry start[0]; + __u64 _data[0]; + }; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4); +LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5); @@ -9352,7 +9781,7 @@ index 000000000000..9e10fc8301f0 + struct bkey_packed start[0]; + __u64 _data[0]; + }; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +LE32_BITMASK(BSET_CSUM_TYPE, struct bset, flags, 0, 4); + @@ -9385,7 +9814,7 @@ index 000000000000..9e10fc8301f0 + + }; + }; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +LE64_BITMASK(BTREE_NODE_ID, struct btree_node, flags, 0, 4); +LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags, 4, 8); @@ -9406,12 +9835,12 @@ index 000000000000..9e10fc8301f0 + + }; + }; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +#endif /* _BCACHEFS_FORMAT_H */ diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h new file mode 100644 -index 000000000000..b2edabf58260 +index 000000000000..ad47a506a907 --- /dev/null +++ b/fs/bcachefs/bcachefs_ioctl.h @@ -0,0 +1,368 @@ @@ -9625,7 +10054,7 @@ index 000000000000..b2edabf58260 + __u64 pad[8]; + }; + }; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +enum bch_data_event { + BCH_DATA_EVENT_PROGRESS = 0, @@ -9641,7 +10070,7 @@ index 000000000000..b2edabf58260 + + __u64 sectors_done; + __u64 sectors_total; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +struct bch_ioctl_data_event { + __u8 type; @@ -9650,12 +10079,12 @@ index 000000000000..b2edabf58260 + struct bch_ioctl_data_progress p; + __u64 pad2[15]; + }; -+} __attribute__((packed, aligned(8))); ++} __packed 
__aligned(8); + +struct bch_replicas_usage { + __u64 sectors; + struct bch_replicas_entry r; -+} __attribute__((packed)); ++} __packed; + +static inline struct bch_replicas_usage * +replicas_usage_next(struct bch_replicas_usage *u) @@ -9785,14 +10214,15 @@ index 000000000000..b2edabf58260 +#endif /* _BCACHEFS_IOCTL_H */ diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c new file mode 100644 -index 000000000000..d348175edad4 +index 000000000000..630df060fbe9 --- /dev/null +++ b/fs/bcachefs/bkey.c -@@ -0,0 +1,1203 @@ +@@ -0,0 +1,1098 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "bkey.h" ++#include "bkey_cmp.h" +#include "bkey_methods.h" +#include "bset.h" +#include "util.h" @@ -9807,9 +10237,6 @@ index 000000000000..d348175edad4 + +const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT; + -+struct bkey __bch2_bkey_unpack_key(const struct bkey_format *, -+ const struct bkey_packed *); -+ +void bch2_bkey_packed_to_binary_text(struct printbuf *out, + const struct bkey_format *f, + const struct bkey_packed *k) @@ -10554,50 +10981,6 @@ index 000000000000..d348175edad4 + +#ifdef CONFIG_X86_64 + -+static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, -+ unsigned nr_key_bits) -+{ -+ long d0, d1, d2, d3; -+ int cmp; -+ -+ /* we shouldn't need asm for this, but gcc is being retarded: */ -+ -+ asm(".intel_syntax noprefix;" -+ "xor eax, eax;" -+ "xor edx, edx;" -+ "1:;" -+ "mov r8, [rdi];" -+ "mov r9, [rsi];" -+ "sub ecx, 64;" -+ "jl 2f;" -+ -+ "cmp r8, r9;" -+ "jnz 3f;" -+ -+ "lea rdi, [rdi - 8];" -+ "lea rsi, [rsi - 8];" -+ "jmp 1b;" -+ -+ "2:;" -+ "not ecx;" -+ "shr r8, 1;" -+ "shr r9, 1;" -+ "shr r8, cl;" -+ "shr r9, cl;" -+ "cmp r8, r9;" -+ -+ "3:\n" -+ "seta al;" -+ "setb dl;" -+ "sub eax, edx;" -+ ".att_syntax prefix;" -+ : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp) -+ : "0" (l), "1" (r), "3" (nr_key_bits) -+ : "r8", "r9", "cc", "memory"); -+ -+ return cmp; -+} -+ +#define I(_x) (*(out)++ = (_x)) 
+#define I1(i0) I(i0) +#define I2(i0, i1) (I1(i0), I(i1)) @@ -10828,40 +11211,6 @@ index 000000000000..d348175edad4 +} + +#else -+static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, -+ unsigned nr_key_bits) -+{ -+ u64 l_v, r_v; -+ -+ if (!nr_key_bits) -+ return 0; -+ -+ /* for big endian, skip past header */ -+ nr_key_bits += high_bit_offset; -+ l_v = *l & (~0ULL >> high_bit_offset); -+ r_v = *r & (~0ULL >> high_bit_offset); -+ -+ while (1) { -+ if (nr_key_bits < 64) { -+ l_v >>= 64 - nr_key_bits; -+ r_v >>= 64 - nr_key_bits; -+ nr_key_bits = 0; -+ } else { -+ nr_key_bits -= 64; -+ } -+ -+ if (!nr_key_bits || l_v != r_v) -+ break; -+ -+ l = next_word(l); -+ r = next_word(r); -+ -+ l_v = *l; -+ r_v = *r; -+ } -+ -+ return cmp_int(l_v, r_v); -+} +#endif + +__pure @@ -10869,19 +11218,7 @@ index 000000000000..d348175edad4 + const struct bkey_packed *r, + const struct btree *b) +{ -+ const struct bkey_format *f = &b->format; -+ int ret; -+ -+ EBUG_ON(!bkey_packed(l) || !bkey_packed(r)); -+ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f)); -+ -+ ret = __bkey_cmp_bits(high_word(f, l), -+ high_word(f, r), -+ b->nr_key_bits); -+ -+ EBUG_ON(ret != bpos_cmp(bkey_unpack_pos(b, l), -+ bkey_unpack_pos(b, r))); -+ return ret; ++ return __bch2_bkey_cmp_packed_format_checked_inlined(l, r, b); +} + +__pure __flatten @@ -10897,20 +11234,7 @@ index 000000000000..d348175edad4 + const struct bkey_packed *l, + const struct bkey_packed *r) +{ -+ struct bkey unpacked; -+ -+ if (likely(bkey_packed(l) && bkey_packed(r))) -+ return __bch2_bkey_cmp_packed_format_checked(l, r, b); -+ -+ if (bkey_packed(l)) { -+ __bkey_unpack_key_format_checked(b, &unpacked, l); -+ l = (void*) &unpacked; -+ } else if (bkey_packed(r)) { -+ __bkey_unpack_key_format_checked(b, &unpacked, r); -+ r = (void*) &unpacked; -+ } -+ -+ return bpos_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p); ++ return bch2_bkey_cmp_packed_inlined(b, l, r); +} + +__pure __flatten @@ -10994,10 +11318,10 @@ index 
000000000000..d348175edad4 +#endif diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h new file mode 100644 -index 000000000000..df9fb859d1db +index 000000000000..19b59ffe0a98 --- /dev/null +++ b/fs/bcachefs/bkey.h -@@ -0,0 +1,571 @@ +@@ -0,0 +1,666 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BKEY_H +#define _BCACHEFS_BKEY_H @@ -11005,6 +11329,7 @@ index 000000000000..df9fb859d1db +#include +#include "bcachefs_format.h" + ++#include "btree_types.h" +#include "util.h" +#include "vstructs.h" + @@ -11134,8 +11459,9 @@ index 000000000000..df9fb859d1db +} + +/* -+ * we prefer to pass bpos by ref, but it's often enough terribly convenient to -+ * pass it by by val... as much as I hate c++, const ref would be nice here: ++ * The compiler generates better code when we pass bpos by ref, but it's often ++ * enough terribly convenient to pass it by val... as much as I hate c++, const ++ * ref would be nice here: + */ +__pure __flatten +static inline int bkey_cmp_left_packed_byval(const struct btree *b, @@ -11356,6 +11682,99 @@ index 000000000000..df9fb859d1db +bool bch2_bkey_pack(struct bkey_packed *, const struct bkey_i *, + const struct bkey_format *); + ++typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *); ++ ++static inline void ++__bkey_unpack_key_format_checked(const struct btree *b, ++ struct bkey *dst, ++ const struct bkey_packed *src) ++{ ++ if (IS_ENABLED(HAVE_BCACHEFS_COMPILED_UNPACK)) { ++ compiled_unpack_fn unpack_fn = b->aux_data; ++ unpack_fn(dst, src); ++ ++ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && ++ bch2_expensive_debug_checks) { ++ struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); ++ ++ BUG_ON(memcmp(dst, &dst2, sizeof(*dst))); ++ } ++ } else { ++ *dst = __bch2_bkey_unpack_key(&b->format, src); ++ } ++} ++ ++static inline struct bkey ++bkey_unpack_key_format_checked(const struct btree *b, ++ const struct bkey_packed *src) ++{ ++ struct bkey dst; ++ ++ __bkey_unpack_key_format_checked(b, &dst, src); ++ 
return dst; ++} ++ ++static inline void __bkey_unpack_key(const struct btree *b, ++ struct bkey *dst, ++ const struct bkey_packed *src) ++{ ++ if (likely(bkey_packed(src))) ++ __bkey_unpack_key_format_checked(b, dst, src); ++ else ++ *dst = *packed_to_bkey_c(src); ++} ++ ++/** ++ * bkey_unpack_key -- unpack just the key, not the value ++ */ ++static inline struct bkey bkey_unpack_key(const struct btree *b, ++ const struct bkey_packed *src) ++{ ++ return likely(bkey_packed(src)) ++ ? bkey_unpack_key_format_checked(b, src) ++ : *packed_to_bkey_c(src); ++} ++ ++static inline struct bpos ++bkey_unpack_pos_format_checked(const struct btree *b, ++ const struct bkey_packed *src) ++{ ++#ifdef HAVE_BCACHEFS_COMPILED_UNPACK ++ return bkey_unpack_key_format_checked(b, src).p; ++#else ++ return __bkey_unpack_pos(&b->format, src); ++#endif ++} ++ ++static inline struct bpos bkey_unpack_pos(const struct btree *b, ++ const struct bkey_packed *src) ++{ ++ return likely(bkey_packed(src)) ++ ? bkey_unpack_pos_format_checked(b, src) ++ : packed_to_bkey_c(src)->p; ++} ++ ++/* Disassembled bkeys */ ++ ++static inline struct bkey_s_c bkey_disassemble(struct btree *b, ++ const struct bkey_packed *k, ++ struct bkey *u) ++{ ++ __bkey_unpack_key(b, u, k); ++ ++ return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), }; ++} ++ ++/* non const version: */ ++static inline struct bkey_s __bkey_disassemble(struct btree *b, ++ struct bkey_packed *k, ++ struct bkey *u) ++{ ++ __bkey_unpack_key(b, u, k); ++ ++ return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), }; ++} ++ +static inline u64 bkey_field_max(const struct bkey_format *f, + enum bch_bkey_fields nr) +{ @@ -11571,15 +11990,16 @@ index 000000000000..df9fb859d1db +#endif /* _BCACHEFS_BKEY_H */ diff --git a/fs/bcachefs/bkey_buf.h b/fs/bcachefs/bkey_buf.h new file mode 100644 -index 000000000000..0d7c67a959af +index 000000000000..a30c4ae8eb36 --- /dev/null +++ b/fs/bcachefs/bkey_buf.h -@@ -0,0 +1,60 @@ +@@ -0,0 +1,61 @@ +/* 
SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BKEY_BUF_H +#define _BCACHEFS_BKEY_BUF_H + +#include "bcachefs.h" ++#include "bkey.h" + +struct bkey_buf { + struct bkey_i *k; @@ -11635,12 +12055,147 @@ index 000000000000..0d7c67a959af +} + +#endif /* _BCACHEFS_BKEY_BUF_H */ +diff --git a/fs/bcachefs/bkey_cmp.h b/fs/bcachefs/bkey_cmp.h +new file mode 100644 +index 000000000000..5f42a6e69360 +--- /dev/null ++++ b/fs/bcachefs/bkey_cmp.h +@@ -0,0 +1,129 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BKEY_CMP_H ++#define _BCACHEFS_BKEY_CMP_H ++ ++#include "bkey.h" ++ ++#ifdef CONFIG_X86_64 ++static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, ++ unsigned nr_key_bits) ++{ ++ long d0, d1, d2, d3; ++ int cmp; ++ ++ /* we shouldn't need asm for this, but gcc is being retarded: */ ++ ++ asm(".intel_syntax noprefix;" ++ "xor eax, eax;" ++ "xor edx, edx;" ++ "1:;" ++ "mov r8, [rdi];" ++ "mov r9, [rsi];" ++ "sub ecx, 64;" ++ "jl 2f;" ++ ++ "cmp r8, r9;" ++ "jnz 3f;" ++ ++ "lea rdi, [rdi - 8];" ++ "lea rsi, [rsi - 8];" ++ "jmp 1b;" ++ ++ "2:;" ++ "not ecx;" ++ "shr r8, 1;" ++ "shr r9, 1;" ++ "shr r8, cl;" ++ "shr r9, cl;" ++ "cmp r8, r9;" ++ ++ "3:\n" ++ "seta al;" ++ "setb dl;" ++ "sub eax, edx;" ++ ".att_syntax prefix;" ++ : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp) ++ : "0" (l), "1" (r), "3" (nr_key_bits) ++ : "r8", "r9", "cc", "memory"); ++ ++ return cmp; ++} ++#else ++static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, ++ unsigned nr_key_bits) ++{ ++ u64 l_v, r_v; ++ ++ if (!nr_key_bits) ++ return 0; ++ ++ /* for big endian, skip past header */ ++ nr_key_bits += high_bit_offset; ++ l_v = *l & (~0ULL >> high_bit_offset); ++ r_v = *r & (~0ULL >> high_bit_offset); ++ ++ while (1) { ++ if (nr_key_bits < 64) { ++ l_v >>= 64 - nr_key_bits; ++ r_v >>= 64 - nr_key_bits; ++ nr_key_bits = 0; ++ } else { ++ nr_key_bits -= 64; ++ } ++ ++ if (!nr_key_bits || l_v != r_v) ++ break; ++ ++ l = next_word(l); ++ r = next_word(r); 
++ ++ l_v = *l; ++ r_v = *r; ++ } ++ ++ return cmp_int(l_v, r_v); ++} ++#endif ++ ++static inline __pure __flatten ++int __bch2_bkey_cmp_packed_format_checked_inlined(const struct bkey_packed *l, ++ const struct bkey_packed *r, ++ const struct btree *b) ++{ ++ const struct bkey_format *f = &b->format; ++ int ret; ++ ++ EBUG_ON(!bkey_packed(l) || !bkey_packed(r)); ++ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f)); ++ ++ ret = __bkey_cmp_bits(high_word(f, l), ++ high_word(f, r), ++ b->nr_key_bits); ++ ++ EBUG_ON(ret != bpos_cmp(bkey_unpack_pos(b, l), ++ bkey_unpack_pos(b, r))); ++ return ret; ++} ++ ++static inline __pure __flatten ++int bch2_bkey_cmp_packed_inlined(const struct btree *b, ++ const struct bkey_packed *l, ++ const struct bkey_packed *r) ++{ ++ struct bkey unpacked; ++ ++ if (likely(bkey_packed(l) && bkey_packed(r))) ++ return __bch2_bkey_cmp_packed_format_checked_inlined(l, r, b); ++ ++ if (bkey_packed(l)) { ++ __bkey_unpack_key_format_checked(b, &unpacked, l); ++ l = (void *) &unpacked; ++ } else if (bkey_packed(r)) { ++ __bkey_unpack_key_format_checked(b, &unpacked, r); ++ r = (void *) &unpacked; ++ } ++ ++ return bpos_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p); ++} ++ ++#endif /* _BCACHEFS_BKEY_CMP_H */ diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c new file mode 100644 -index 000000000000..e0cbac8811af +index 000000000000..6939d74d705e --- /dev/null +++ b/fs/bcachefs/bkey_methods.c -@@ -0,0 +1,503 @@ +@@ -0,0 +1,505 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -11672,13 +12227,13 @@ index 000000000000..e0cbac8811af + return 0; +} + -+#define bch2_bkey_ops_deleted (struct bkey_ops) { \ ++#define bch2_bkey_ops_deleted ((struct bkey_ops) { \ + .key_invalid = deleted_key_invalid, \ -+} ++}) + -+#define bch2_bkey_ops_whiteout (struct bkey_ops) { \ ++#define bch2_bkey_ops_whiteout ((struct bkey_ops) { \ + .key_invalid = deleted_key_invalid, \ -+} ++}) + +static int empty_val_key_invalid(const 
struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) @@ -11692,9 +12247,9 @@ index 000000000000..e0cbac8811af + return 0; +} + -+#define bch2_bkey_ops_error (struct bkey_ops) { \ ++#define bch2_bkey_ops_error ((struct bkey_ops) { \ + .key_invalid = empty_val_key_invalid, \ -+} ++}) + +static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) @@ -11708,13 +12263,13 @@ index 000000000000..e0cbac8811af + return 0; +} + -+#define bch2_bkey_ops_cookie (struct bkey_ops) { \ ++#define bch2_bkey_ops_cookie ((struct bkey_ops) { \ + .key_invalid = key_type_cookie_invalid, \ -+} ++}) + -+#define bch2_bkey_ops_hash_whiteout (struct bkey_ops) { \ ++#define bch2_bkey_ops_hash_whiteout ((struct bkey_ops) {\ + .key_invalid = empty_val_key_invalid, \ -+} ++}) + +static int key_type_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) @@ -11732,10 +12287,10 @@ index 000000000000..e0cbac8811af + datalen, min(datalen, 32U), d.v->data); +} + -+#define bch2_bkey_ops_inline_data (struct bkey_ops) { \ ++#define bch2_bkey_ops_inline_data ((struct bkey_ops) { \ + .key_invalid = key_type_inline_data_invalid, \ + .val_to_text = key_type_inline_data_to_text, \ -+} ++}) + +static int key_type_set_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) @@ -11755,10 +12310,10 @@ index 000000000000..e0cbac8811af + return true; +} + -+#define bch2_bkey_ops_set (struct bkey_ops) { \ ++#define bch2_bkey_ops_set ((struct bkey_ops) { \ + .key_invalid = key_type_set_invalid, \ + .key_merge = key_type_set_merge, \ -+} ++}) + +const struct bkey_ops bch2_bkey_ops[] = { +#define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name, @@ -11792,6 +12347,7 @@ index 000000000000..e0cbac8811af + (1U << KEY_TYPE_whiteout)| + (1U << KEY_TYPE_inode)| + (1U << KEY_TYPE_inode_v2)| ++ (1U << KEY_TYPE_inode_v3)| + (1U << KEY_TYPE_inode_generation), + [BKEY_TYPE_dirents] = + (1U << 
KEY_TYPE_deleted)| @@ -12082,6 +12638,7 @@ index 000000000000..e0cbac8811af + btree_id == BTREE_ID_inodes) { + if (!bkey_packed(k)) { + struct bkey_i *u = packed_to_bkey(k); ++ + swap(u->k.p.inode, u->k.p.offset); + } else if (f->bits_per_field[BKEY_FIELD_INODE] && + f->bits_per_field[BKEY_FIELD_OFFSET]) { @@ -12146,7 +12703,7 @@ index 000000000000..e0cbac8811af +} diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h new file mode 100644 -index 000000000000..db894b40d2ca +index 000000000000..4739b3c32cff --- /dev/null +++ b/fs/bcachefs/bkey_methods.h @@ -0,0 +1,175 @@ @@ -12170,7 +12727,7 @@ index 000000000000..db894b40d2ca + * + * When invalid, error string is returned via @err. @rw indicates whether key is + * being read or written; more aggressive checks can be enabled when rw == WRITE. -+*/ ++ */ +struct bkey_ops { + int (*key_invalid)(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err); @@ -12327,13 +12884,14 @@ index 000000000000..db894b40d2ca +#endif /* _BCACHEFS_BKEY_METHODS_H */ diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c new file mode 100644 -index 000000000000..b1385a77da11 +index 000000000000..be0d4bc1afd3 --- /dev/null +++ b/fs/bcachefs/bkey_sort.c -@@ -0,0 +1,198 @@ +@@ -0,0 +1,199 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "bkey_buf.h" ++#include "bkey_cmp.h" +#include "bkey_sort.h" +#include "bset.h" +#include "extents.h" @@ -12488,7 +13046,7 @@ index 000000000000..b1385a77da11 + struct bkey_packed *l, + struct bkey_packed *r) +{ -+ return bch2_bkey_cmp_packed(b, l, r) ?: ++ return bch2_bkey_cmp_packed_inlined(b, l, r) ?: + (int) bkey_deleted(r) - (int) bkey_deleted(l) ?: + (int) l->needs_whiteout - (int) r->needs_whiteout; +} @@ -12510,7 +13068,7 @@ index 000000000000..b1385a77da11 + continue; + + while ((next = sort_iter_peek(iter)) && -+ !bch2_bkey_cmp_packed(iter->b, in, next)) { ++ !bch2_bkey_cmp_packed_inlined(iter->b, in, next)) { + 
BUG_ON(in->needs_whiteout && + next->needs_whiteout); + needs_whiteout |= in->needs_whiteout; @@ -12581,10 +13139,10 @@ index 000000000000..79cf11d1b4e7 +#endif /* _BCACHEFS_BKEY_SORT_H */ diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c new file mode 100644 -index 000000000000..fa60ef84e4ef +index 000000000000..094235364470 --- /dev/null +++ b/fs/bcachefs/bset.c -@@ -0,0 +1,1598 @@ +@@ -0,0 +1,1601 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Code for working with individual keys, and sorted sets of keys with in a @@ -13552,7 +14110,7 @@ index 000000000000..fa60ef84e4ef + t->size -= j - l; + + for (j = l; j < t->size; j++) -+ rw_aux_tree(b, t)[j].offset += shift; ++ rw_aux_tree(b, t)[j].offset += shift; + + EBUG_ON(l < t->size && + rw_aux_tree(b, t)[l].offset == @@ -13853,7 +14411,7 @@ index 000000000000..fa60ef84e4ef + bch2_btree_node_iter_sort(iter, b); +} + -+noinline __flatten __attribute__((cold)) ++noinline __flatten __cold +static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, + struct btree *b, struct bpos *search) +{ @@ -14028,7 +14586,10 @@ index 000000000000..fa60ef84e4ef + EBUG_ON(iter->data->k > iter->data->end); + + if (unlikely(__btree_node_iter_set_end(iter, 0))) { -+ bch2_btree_node_iter_set_drop(iter, iter->data); ++ /* avoid an expensive memmove call: */ ++ iter->data[0] = iter->data[1]; ++ iter->data[1] = iter->data[2]; ++ iter->data[2] = (struct btree_node_iter_set) { 0, 0 }; + return; + } + @@ -14185,10 +14746,10 @@ index 000000000000..fa60ef84e4ef +} diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h new file mode 100644 -index 000000000000..0d46534c3dcd +index 000000000000..72e6376bce2a --- /dev/null +++ b/fs/bcachefs/bset.h -@@ -0,0 +1,615 @@ +@@ -0,0 +1,521 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BSET_H +#define _BCACHEFS_BSET_H @@ -14396,100 +14957,6 @@ index 000000000000..0d46534c3dcd + return btree_aux_data_bytes(b) / sizeof(u64); +} + -+typedef void (*compiled_unpack_fn)(struct bkey 
*, const struct bkey_packed *); -+ -+static inline void -+__bkey_unpack_key_format_checked(const struct btree *b, -+ struct bkey *dst, -+ const struct bkey_packed *src) -+{ -+#ifdef HAVE_BCACHEFS_COMPILED_UNPACK -+ { -+ compiled_unpack_fn unpack_fn = b->aux_data; -+ unpack_fn(dst, src); -+ -+ if (bch2_expensive_debug_checks) { -+ struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); -+ -+ BUG_ON(memcmp(dst, &dst2, sizeof(*dst))); -+ } -+ } -+#else -+ *dst = __bch2_bkey_unpack_key(&b->format, src); -+#endif -+} -+ -+static inline struct bkey -+bkey_unpack_key_format_checked(const struct btree *b, -+ const struct bkey_packed *src) -+{ -+ struct bkey dst; -+ -+ __bkey_unpack_key_format_checked(b, &dst, src); -+ return dst; -+} -+ -+static inline void __bkey_unpack_key(const struct btree *b, -+ struct bkey *dst, -+ const struct bkey_packed *src) -+{ -+ if (likely(bkey_packed(src))) -+ __bkey_unpack_key_format_checked(b, dst, src); -+ else -+ *dst = *packed_to_bkey_c(src); -+} -+ -+/** -+ * bkey_unpack_key -- unpack just the key, not the value -+ */ -+static inline struct bkey bkey_unpack_key(const struct btree *b, -+ const struct bkey_packed *src) -+{ -+ return likely(bkey_packed(src)) -+ ? bkey_unpack_key_format_checked(b, src) -+ : *packed_to_bkey_c(src); -+} -+ -+static inline struct bpos -+bkey_unpack_pos_format_checked(const struct btree *b, -+ const struct bkey_packed *src) -+{ -+#ifdef HAVE_BCACHEFS_COMPILED_UNPACK -+ return bkey_unpack_key_format_checked(b, src).p; -+#else -+ return __bkey_unpack_pos(&b->format, src); -+#endif -+} -+ -+static inline struct bpos bkey_unpack_pos(const struct btree *b, -+ const struct bkey_packed *src) -+{ -+ return likely(bkey_packed(src)) -+ ? 
bkey_unpack_pos_format_checked(b, src) -+ : packed_to_bkey_c(src)->p; -+} -+ -+/* Disassembled bkeys */ -+ -+static inline struct bkey_s_c bkey_disassemble(struct btree *b, -+ const struct bkey_packed *k, -+ struct bkey *u) -+{ -+ __bkey_unpack_key(b, u, k); -+ -+ return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), }; -+} -+ -+/* non const version: */ -+static inline struct bkey_s __bkey_disassemble(struct btree *b, -+ struct bkey_packed *k, -+ struct bkey *u) -+{ -+ __bkey_unpack_key(b, u, k); -+ -+ return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), }; -+} -+ +#define for_each_bset(_b, _t) \ + for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++) + @@ -14806,10 +15273,10 @@ index 000000000000..0d46534c3dcd +#endif /* _BCACHEFS_BSET_H */ diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c new file mode 100644 -index 000000000000..f84b50869de2 +index 000000000000..75e744792a92 --- /dev/null +++ b/fs/bcachefs/btree_cache.c -@@ -0,0 +1,1149 @@ +@@ -0,0 +1,1204 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -14826,6 +15293,12 @@ index 000000000000..f84b50869de2 +#include +#include + ++#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \ ++do { \ ++ if (shrinker_counter) \ ++ bc->not_freed_##counter++; \ ++} while (0) ++ +const char * const bch2_btree_node_flags[] = { +#define x(f) #f, + BTREE_FLAGS() @@ -14924,7 +15397,9 @@ index 000000000000..f84b50869de2 + +static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) +{ -+ struct btree *b = kzalloc(sizeof(struct btree), gfp); ++ struct btree *b; ++ ++ b = kzalloc(sizeof(struct btree), gfp); + if (!b) + return NULL; + @@ -14942,7 +15417,9 @@ index 000000000000..f84b50869de2 +struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) +{ + struct btree_cache *bc = &c->btree_cache; -+ struct btree *b = __btree_node_mem_alloc(c, GFP_KERNEL); ++ struct btree *b; ++ ++ b = __btree_node_mem_alloc(c, GFP_KERNEL); + if (!b) + return NULL; + @@ -14961,6 +15438,7 
@@ index 000000000000..f84b50869de2 +void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) +{ + int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); ++ + BUG_ON(ret); + + /* Cause future lookups for this node to fail: */ @@ -14987,7 +15465,7 @@ index 000000000000..f84b50869de2 + mutex_lock(&bc->lock); + ret = __bch2_btree_node_hash_insert(bc, b); + if (!ret) -+ list_add(&b->list, &bc->live); ++ list_add_tail(&b->list, &bc->live); + mutex_unlock(&bc->lock); + + return ret; @@ -15006,7 +15484,7 @@ index 000000000000..f84b50869de2 + * this version is for btree nodes that have already been freed (we're not + * reaping a real btree node) + */ -+static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) ++static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter) +{ + struct btree_cache *bc = &c->btree_cache; + int ret = 0; @@ -15016,38 +15494,64 @@ index 000000000000..f84b50869de2 + if (b->flags & ((1U << BTREE_NODE_dirty)| + (1U << BTREE_NODE_read_in_flight)| + (1U << BTREE_NODE_write_in_flight))) { -+ if (!flush) ++ if (!flush) { ++ if (btree_node_dirty(b)) ++ BTREE_CACHE_NOT_FREED_INCREMENT(dirty); ++ else if (btree_node_read_in_flight(b)) ++ BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight); ++ else if (btree_node_write_in_flight(b)) ++ BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight); + return -ENOMEM; ++ } + + /* XXX: waiting on IO with btree cache lock held */ + bch2_btree_node_wait_on_read(b); + bch2_btree_node_wait_on_write(b); + } + -+ if (!six_trylock_intent(&b->c.lock)) ++ if (!six_trylock_intent(&b->c.lock)) { ++ BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent); + return -ENOMEM; ++ } + -+ if (!six_trylock_write(&b->c.lock)) ++ if (!six_trylock_write(&b->c.lock)) { ++ BTREE_CACHE_NOT_FREED_INCREMENT(lock_write); + goto out_unlock_intent; ++ } + + /* recheck under lock */ + if (b->flags & ((1U << BTREE_NODE_read_in_flight)| + (1U << 
BTREE_NODE_write_in_flight))) { -+ if (!flush) ++ if (!flush) { ++ if (btree_node_read_in_flight(b)) ++ BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight); ++ else if (btree_node_write_in_flight(b)) ++ BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight); + goto out_unlock; ++ } + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + goto wait_on_io; + } + -+ if (btree_node_noevict(b) || -+ btree_node_write_blocked(b) || -+ btree_node_will_make_reachable(b)) ++ if (btree_node_noevict(b)) { ++ BTREE_CACHE_NOT_FREED_INCREMENT(noevict); + goto out_unlock; ++ } ++ if (btree_node_write_blocked(b)) { ++ BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked); ++ goto out_unlock; ++ } ++ if (btree_node_will_make_reachable(b)) { ++ BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable); ++ goto out_unlock; ++ } + + if (btree_node_dirty(b)) { -+ if (!flush) ++ if (!flush) { ++ BTREE_CACHE_NOT_FREED_INCREMENT(dirty); + goto out_unlock; ++ } + /* + * Using the underscore version because we don't want to compact + * bsets after the write, since this node is about to be evicted @@ -15055,9 +15559,11 @@ index 000000000000..f84b50869de2 + * the post write cleanup: + */ + if (bch2_verify_btree_ondisk) -+ bch2_btree_node_write(c, b, SIX_LOCK_intent, 0); ++ bch2_btree_node_write(c, b, SIX_LOCK_intent, ++ BTREE_WRITE_cache_reclaim); + else -+ __bch2_btree_node_write(c, b, 0); ++ __bch2_btree_node_write(c, b, ++ BTREE_WRITE_cache_reclaim); + + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); @@ -15075,14 +15581,14 @@ index 000000000000..f84b50869de2 + goto out; +} + -+static int btree_node_reclaim(struct bch_fs *c, struct btree *b) ++static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter) +{ -+ return __btree_node_reclaim(c, b, false); ++ return __btree_node_reclaim(c, b, false, shrinker_counter); +} + +static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b) +{ -+ return __btree_node_reclaim(c, b, true); ++ return 
__btree_node_reclaim(c, b, true, false); +} + +static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, @@ -15131,11 +15637,12 @@ index 000000000000..f84b50869de2 + if (touched >= nr) + goto out; + -+ if (!btree_node_reclaim(c, b)) { ++ if (!btree_node_reclaim(c, b, true)) { + btree_node_data_free(c, b); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + freed++; ++ bc->freed++; + } + } +restart: @@ -15144,9 +15651,11 @@ index 000000000000..f84b50869de2 + + if (btree_node_accessed(b)) { + clear_btree_node_accessed(b); -+ } else if (!btree_node_reclaim(c, b)) { ++ bc->not_freed_access_bit++; ++ } else if (!btree_node_reclaim(c, b, true)) { + freed++; + btree_node_data_free(c, b); ++ bc->freed++; + + bch2_btree_node_hash_remove(bc, b); + six_unlock_write(&b->c.lock); @@ -15161,7 +15670,7 @@ index 000000000000..f84b50869de2 + six_trylock_read(&b->c.lock)) { + list_move(&bc->live, &b->list); + mutex_unlock(&bc->lock); -+ __bch2_btree_node_write(c, b, 0); ++ __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim); + six_unlock_read(&b->c.lock); + if (touched >= nr) + goto out_nounlock; @@ -15202,7 +15711,7 @@ index 000000000000..f84b50869de2 + struct bch_fs *c = container_of(shrink, struct bch_fs, + btree_cache.shrink); + -+ bch2_btree_cache_to_text(out, c); ++ bch2_btree_cache_to_text(out, &c->btree_cache); +} + +void bch2_fs_btree_cache_exit(struct bch_fs *c) @@ -15360,7 +15869,7 @@ index 000000000000..f84b50869de2 + struct btree *b; + + list_for_each_entry_reverse(b, &bc->live, list) -+ if (!btree_node_reclaim(c, b)) ++ if (!btree_node_reclaim(c, b, false)) + return b; + + while (1) { @@ -15395,7 +15904,7 @@ index 000000000000..f84b50869de2 + * disk node. 
Check the freed list before allocating a new one: + */ + list_for_each_entry(b, freed, list) -+ if (!btree_node_reclaim(c, b)) { ++ if (!btree_node_reclaim(c, b, false)) { + list_del_init(&b->list); + goto got_node; + } @@ -15421,7 +15930,7 @@ index 000000000000..f84b50869de2 + * the list. Check if there's any freed nodes there: + */ + list_for_each_entry(b2, &bc->freeable, list) -+ if (!btree_node_reclaim(c, b2)) { ++ if (!btree_node_reclaim(c, b2, false)) { + swap(b->data, b2->data); + swap(b->aux_data, b2->aux_data); + btree_node_to_freedlist(bc, b2); @@ -15447,6 +15956,7 @@ index 000000000000..f84b50869de2 + b->flags = 0; + b->written = 0; + b->nsets = 0; ++ b->write_type = 0; + b->sib_u64s[0] = 0; + b->sib_u64s[1] = 0; + b->whiteout_u64s = 0; @@ -15642,7 +16152,7 @@ index 000000000000..f84b50869de2 + if (likely(c->opts.btree_node_mem_ptr_optimization && + b && + b->hash_val == btree_ptr_hash_val(k))) -+ goto lock_node; ++ goto lock_node; +retry: + b = btree_cache_find(bc, k); + if (unlikely(!b)) { @@ -15882,7 +16392,7 @@ index 000000000000..f84b50869de2 + + /* XXX we're called from btree_gc which will be holding other btree + * nodes locked -+ * */ ++ */ + __bch2_btree_node_wait_on_read(b); + __bch2_btree_node_wait_on_write(b); + @@ -15890,7 +16400,7 @@ index 000000000000..f84b50869de2 + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); + + if (btree_node_dirty(b)) { -+ __bch2_btree_node_write(c, b, 0); ++ __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + goto wait_on_io; @@ -15953,24 +16463,37 @@ index 000000000000..f84b50869de2 + stats.failed); +} + -+void bch2_btree_cache_to_text(struct printbuf *out, struct bch_fs *c) ++void bch2_btree_cache_to_text(struct printbuf *out, struct btree_cache *bc) +{ -+ prt_printf(out, "nr nodes:\t\t%u\n", c->btree_cache.used); -+ prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty)); -+ prt_printf(out, "cannibalize 
lock:\t%p\n", c->btree_cache.alloc_lock); ++ prt_printf(out, "nr nodes:\t\t%u\n", bc->used); ++ prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&bc->dirty)); ++ prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock); ++ ++ prt_printf(out, "freed:\t\t\t\t%u\n", bc->freed); ++ prt_printf(out, "not freed, dirty:\t\t%u\n", bc->not_freed_dirty); ++ prt_printf(out, "not freed, write in flight:\t%u\n", bc->not_freed_write_in_flight); ++ prt_printf(out, "not freed, read in flight:\t%u\n", bc->not_freed_read_in_flight); ++ prt_printf(out, "not freed, lock intent failed:\t%u\n", bc->not_freed_lock_intent); ++ prt_printf(out, "not freed, lock write failed:\t%u\n", bc->not_freed_lock_write); ++ prt_printf(out, "not freed, access bit:\t\t%u\n", bc->not_freed_access_bit); ++ prt_printf(out, "not freed, no evict failed:\t%u\n", bc->not_freed_noevict); ++ prt_printf(out, "not freed, write blocked:\t%u\n", bc->not_freed_write_blocked); ++ prt_printf(out, "not freed, will make reachable:\t%u\n", bc->not_freed_will_make_reachable); ++ +} diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h new file mode 100644 -index 000000000000..a4df3e866bb8 +index 000000000000..b623c7028273 --- /dev/null +++ b/fs/bcachefs/btree_cache.h -@@ -0,0 +1,105 @@ +@@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_CACHE_H +#define _BCACHEFS_BTREE_CACHE_H + +#include "bcachefs.h" +#include "btree_types.h" ++#include "bkey_methods.h" + +extern const char * const bch2_btree_node_flags[]; + @@ -16067,12 +16590,12 @@ index 000000000000..a4df3e866bb8 + +void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, + struct btree *); -+void bch2_btree_cache_to_text(struct printbuf *, struct bch_fs *); ++void bch2_btree_cache_to_text(struct printbuf *, struct btree_cache *); + +#endif /* _BCACHEFS_BTREE_CACHE_H */ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c new file mode 100644 -index 000000000000..5b7f7cd3252a +index 000000000000..20e804ecb104 
--- /dev/null +++ b/fs/bcachefs/btree_gc.c @@ -0,0 +1,2106 @@ @@ -16277,7 +16800,7 @@ index 000000000000..5b7f7cd3252a + struct bkey_i_btree_ptr_v2 *new; + int ret; + -+ new = kmalloc(BKEY_BTREE_PTR_U64s_MAX * sizeof(u64), GFP_KERNEL); ++ new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL); + if (!new) + return -ENOMEM; + @@ -16306,7 +16829,7 @@ index 000000000000..5b7f7cd3252a + if (ret) + return ret; + -+ new = kmalloc(BKEY_BTREE_PTR_U64s_MAX * sizeof(u64), GFP_KERNEL); ++ new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL); + if (!new) + return -ENOMEM; + @@ -16396,7 +16919,7 @@ index 000000000000..5b7f7cd3252a + " node %s", + bch2_btree_ids[b->c.btree_id], b->c.level, + buf1.buf, buf2.buf)) -+ ret = set_node_min(c, cur, expected_start); ++ ret = set_node_min(c, cur, expected_start); + } +out: +fsck_err: @@ -18044,7 +18567,7 @@ index 000000000000..5b7f7cd3252a + } + + for (i = 0; i < BTREE_ID_NR; i++) -+ if ((1 << i) & BTREE_ID_HAS_PTRS) { ++ if (btree_type_has_ptrs(i)) { + struct btree_iter iter; + struct bkey_s_c k; + @@ -18057,10 +18580,10 @@ index 000000000000..5b7f7cd3252a + NULL, NULL, + BTREE_INSERT_NOFAIL, + gc_btree_gens_key(&trans, &iter, k)); -+ if (ret) { ++ if (ret && ret != -EROFS) + bch_err(c, "error recalculating oldest_gen: %s", bch2_err_str(ret)); ++ if (ret) + goto err; -+ } + } + + ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, @@ -18070,10 +18593,10 @@ index 000000000000..5b7f7cd3252a + NULL, NULL, + BTREE_INSERT_NOFAIL, + bch2_alloc_write_oldest_gen(&trans, &iter, k)); -+ if (ret) { ++ if (ret && ret != -EROFS) + bch_err(c, "error writing oldest_gen: %s", bch2_err_str(ret)); ++ if (ret) + goto err; -+ } + + c->gc_gens_btree = 0; + c->gc_gens_pos = POS_MIN; @@ -18302,10 +18825,10 @@ index 000000000000..95d803b5743d +#endif /* _BCACHEFS_BTREE_GC_H */ diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c new file mode 100644 -index 000000000000..13ce29750d28 +index 
000000000000..cee3b500d45b --- /dev/null +++ b/fs/bcachefs/btree_io.c -@@ -0,0 +1,2154 @@ +@@ -0,0 +1,2203 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -18759,6 +19282,24 @@ index 000000000000..13ce29750d28 +} + +/* ++ * If we have MAX_BSETS (3) bsets, should we sort them all down to just one? ++ * ++ * The first bset is going to be of similar order to the size of the node, the ++ * last bset is bounded by btree_write_set_buffer(), which is set to keep the ++ * memmove on insert from being too expensive: the middle bset should, ideally, ++ * be the geometric mean of the first and the last. ++ * ++ * Returns true if the middle bset is greater than that geometric mean: ++ */ ++static inline bool should_compact_all(struct bch_fs *c, struct btree *b) ++{ ++ unsigned mid_u64s_bits = ++ (ilog2(btree_max_u64s(c)) + BTREE_WRITE_SET_U64s_BITS) / 2; ++ ++ return bset_u64s(&b->set[1]) > 1U << mid_u64s_bits; ++} ++ ++/* + * @bch_btree_init_next - initialize a new (unwritten) bset that can then be + * inserted into + * @@ -18775,19 +19316,14 @@ index 000000000000..13ce29750d28 + + EBUG_ON(!(b->c.lock.state.seq & 1)); + BUG_ON(bset_written(b, bset(b, &b->set[1]))); ++ BUG_ON(btree_node_just_written(b)); + + if (b->nsets == MAX_BSETS && -+ !btree_node_write_in_flight(b)) { -+ unsigned log_u64s[] = { -+ ilog2(bset_u64s(&b->set[0])), -+ ilog2(bset_u64s(&b->set[1])), -+ ilog2(bset_u64s(&b->set[2])), -+ }; -+ -+ if (log_u64s[1] >= (log_u64s[0] + log_u64s[2]) / 2) { -+ bch2_btree_node_write(c, b, SIX_LOCK_write, 0); -+ reinit_iter = true; -+ } ++ !btree_node_write_in_flight(b) && ++ should_compact_all(c, b)) { ++ bch2_btree_node_write(c, b, SIX_LOCK_write, ++ BTREE_WRITE_init_next_bset); ++ reinit_iter = true; + } + + if (b->nsets == MAX_BSETS && @@ -19524,6 +20060,7 @@ index 000000000000..13ce29750d28 + + if (rb->have_ioref) { + struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); ++ + bch2_latency_acct(ca, rb->start_time, READ); + } + @@ -19711,6 
+20248,7 @@ index 000000000000..13ce29750d28 + + if (rb->have_ioref) { + struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); ++ + bch2_latency_acct(ca, rb->start_time, READ); + } + @@ -19959,7 +20497,7 @@ index 000000000000..13ce29750d28 + } while ((v = cmpxchg(&b->flags, old, new)) != old); + + if (new & (1U << BTREE_NODE_write_in_flight)) -+ __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED); ++ __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|b->write_type); + else + wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); +} @@ -20108,6 +20646,7 @@ index 000000000000..13ce29750d28 + bool used_mempool; + unsigned long old, new; + bool validate_before_checksum = false; ++ enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK; + void *data; + int ret; + @@ -20154,6 +20693,12 @@ index 000000000000..13ce29750d28 + if (new & (1U << BTREE_NODE_need_write)) + return; +do_write: ++ if ((flags & BTREE_WRITE_ONLY_IF_NEED)) ++ type = b->write_type; ++ b->write_type = 0; ++ ++ BUG_ON((type == BTREE_WRITE_initial) != (b->written == 0)); ++ + atomic_dec(&c->btree_cache.dirty); + + BUG_ON(btree_node_fake(b)); @@ -20221,6 +20766,8 @@ index 000000000000..13ce29750d28 + u64s = bch2_sort_keys(i->start, &sort_iter, false); + le16_add_cpu(&i->u64s, u64s); + ++ BUG_ON(!b->written && i->u64s != b->data->keys.u64s); ++ + set_needs_whiteout(i, false); + + /* do we have data to write? 
*/ @@ -20230,6 +20777,10 @@ index 000000000000..13ce29750d28 + bytes_to_write = vstruct_end(i) - data; + sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9; + ++ if (!b->written && ++ b->key.k.type == KEY_TYPE_btree_ptr_v2) ++ BUG_ON(btree_ptr_sectors_written(&b->key) != sectors_to_write); ++ + memset(data + bytes_to_write, 0, + (sectors_to_write << 9) - bytes_to_write); + @@ -20318,27 +20869,18 @@ index 000000000000..13ce29750d28 + + b->written += sectors_to_write; + -+ if (wbio->wbio.first_btree_write && -+ b->key.k.type == KEY_TYPE_btree_ptr_v2) -+ bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written = -+ cpu_to_le16(b->written); -+ + if (wbio->key.k.type == KEY_TYPE_btree_ptr_v2) + bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written = + cpu_to_le16(b->written); + -+ atomic64_inc(&c->btree_writes_nr); -+ atomic64_add(sectors_to_write, &c->btree_writes_sectors); ++ atomic64_inc(&c->btree_write_stats[type].nr); ++ atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes); + + INIT_WORK(&wbio->work, btree_write_submit); + queue_work(c->io_complete_wq, &wbio->work); + return; +err: + set_btree_node_noevict(b); -+ if (!b->written && -+ b->key.k.type == KEY_TYPE_btree_ptr_v2) -+ bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written = -+ cpu_to_le16(sectors_to_write); + b->written += sectors_to_write; +nowrite: + btree_bounce_free(c, bytes, used_mempool, data); @@ -20460,12 +21002,42 @@ index 000000000000..13ce29750d28 +{ + return __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight); +} ++ ++const char * const bch2_btree_write_types[] = { ++#define x(t, n) [n] = #t, ++ BCH_BTREE_WRITE_TYPES() ++ NULL ++}; ++ ++void bch2_btree_write_stats_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ printbuf_tabstop_push(out, 20); ++ printbuf_tabstop_push(out, 10); ++ ++ prt_tab(out); ++ prt_str(out, "nr"); ++ prt_tab(out); ++ prt_str(out, "size"); ++ prt_newline(out); ++ ++ for (unsigned i = 0; i < BTREE_WRITE_TYPE_NR; i++) { ++ u64 nr = 
atomic64_read(&c->btree_write_stats[i].nr); ++ u64 bytes = atomic64_read(&c->btree_write_stats[i].bytes); ++ ++ prt_printf(out, "%s:", bch2_btree_write_types[i]); ++ prt_tab(out); ++ prt_u64(out, nr); ++ prt_tab(out); ++ prt_human_readable_u64(out, nr ? div64_u64(bytes, nr) : 0); ++ prt_newline(out); ++ } ++} diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h new file mode 100644 -index 000000000000..8af853642123 +index 000000000000..4b1810ad7d91 --- /dev/null +++ b/fs/bcachefs/btree_io.h -@@ -0,0 +1,222 @@ +@@ -0,0 +1,228 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_IO_H +#define _BCACHEFS_BTREE_IO_H @@ -20607,8 +21179,12 @@ index 000000000000..8af853642123 + +bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); + -+#define BTREE_WRITE_ONLY_IF_NEED (1U << 0) -+#define BTREE_WRITE_ALREADY_STARTED (1U << 1) ++enum btree_write_flags { ++ __BTREE_WRITE_ONLY_IF_NEED = BTREE_WRITE_TYPE_BITS, ++ __BTREE_WRITE_ALREADY_STARTED, ++}; ++#define BTREE_WRITE_ONLY_IF_NEED (1U << __BTREE_WRITE_ONLY_IF_NEED ) ++#define BTREE_WRITE_ALREADY_STARTED (1U << __BTREE_WRITE_ALREADY_STARTED) + +void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned); +void bch2_btree_node_write(struct bch_fs *, struct btree *, @@ -20687,13 +21263,15 @@ index 000000000000..8af853642123 + bn->min_key = bpos_nosnap_successor(bn->min_key); +} + ++void bch2_btree_write_stats_to_text(struct printbuf *, struct bch_fs *); ++ +#endif /* _BCACHEFS_BTREE_IO_H */ diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c new file mode 100644 -index 000000000000..925ffb318445 +index 000000000000..72a3f400f82b --- /dev/null +++ b/fs/bcachefs/btree_iter.c -@@ -0,0 +1,3043 @@ +@@ -0,0 +1,3121 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -20718,6 +21296,8 @@ index 000000000000..925ffb318445 + +static void btree_trans_verify_sorted(struct btree_trans *); +inline void bch2_btree_path_check_sort(struct btree_trans *, struct 
btree_path *, int); ++static __always_inline void bch2_btree_path_check_sort_fast(struct btree_trans *, ++ struct btree_path *, int); + +static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *); +static inline void btree_path_list_add(struct btree_trans *, struct btree_path *, @@ -20908,6 +21488,7 @@ index 000000000000..925ffb318445 + + if (p) { + struct bkey uk = bkey_unpack_key(l->b, p); ++ + bch2_bkey_to_text(&buf2, &uk); + } else { + prt_printf(&buf2, "(none)"); @@ -20915,6 +21496,7 @@ index 000000000000..925ffb318445 + + if (k) { + struct bkey uk = bkey_unpack_key(l->b, k); ++ + bch2_bkey_to_text(&buf3, &uk); + } else { + prt_printf(&buf3, "(none)"); @@ -21468,7 +22050,7 @@ index 000000000000..925ffb318445 + + bch2_bkey_buf_init(&tmp); + -+ while (nr && !ret) { ++ while (nr-- && !ret) { + if (!bch2_btree_node_relock(trans, path, path->level)) + break; + @@ -21503,7 +22085,7 @@ index 000000000000..925ffb318445 + + bch2_bkey_buf_init(&tmp); + -+ while (nr && !ret) { ++ while (nr-- && !ret) { + if (!bch2_btree_node_relock(trans, path, path->level)) + break; + @@ -21700,14 +22282,9 @@ index 000000000000..925ffb318445 + return ret; +} + -+static inline bool btree_path_good_node(struct btree_trans *trans, -+ struct btree_path *path, -+ unsigned l, int check_pos) ++static inline bool btree_path_check_pos_in_node(struct btree_path *path, ++ unsigned l, int check_pos) +{ -+ if (!is_btree_node(path, l) || -+ !bch2_btree_node_relock(trans, path, l)) -+ return false; -+ + if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b)) + return false; + if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b)) @@ -21715,6 +22292,15 @@ index 000000000000..925ffb318445 + return true; +} + ++static inline bool btree_path_good_node(struct btree_trans *trans, ++ struct btree_path *path, ++ unsigned l, int check_pos) ++{ ++ return is_btree_node(path, l) && ++ bch2_btree_node_relock(trans, path, l) && ++ 
btree_path_check_pos_in_node(path, l, check_pos); ++} ++ +static void btree_path_set_level_down(struct btree_trans *trans, + struct btree_path *path, + unsigned new_level) @@ -21731,9 +22317,9 @@ index 000000000000..925ffb318445 + bch2_btree_path_verify(trans, path); +} + -+static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, -+ struct btree_path *path, -+ int check_pos) ++static noinline unsigned __btree_path_up_until_good_node(struct btree_trans *trans, ++ struct btree_path *path, ++ int check_pos) +{ + unsigned i, l = path->level; +again: @@ -21754,6 +22340,16 @@ index 000000000000..925ffb318445 + return l; +} + ++static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, ++ struct btree_path *path, ++ int check_pos) ++{ ++ return likely(btree_node_locked(path, path->level) && ++ btree_path_check_pos_in_node(path, path->level, check_pos)) ++ ? path->level ++ : __btree_path_up_until_good_node(trans, path, check_pos); ++} ++ +/* + * This is the main state machine for walking down the btree - walks down to a + * specified depth @@ -21850,7 +22446,7 @@ index 000000000000..925ffb318445 + btree_path_traverse_one(trans, path, flags, _RET_IP_); +} + -+static void btree_path_copy(struct btree_trans *trans, struct btree_path *dst, ++static inline void btree_path_copy(struct btree_trans *trans, struct btree_path *dst, + struct btree_path *src) +{ + unsigned i, offset = offsetof(struct btree_path, pos); @@ -21859,12 +22455,12 @@ index 000000000000..925ffb318445 + (void *) src + offset, + sizeof(struct btree_path) - offset); + -+ for (i = 0; i < BTREE_MAX_DEPTH; i++) -+ if (btree_node_locked(dst, i)) -+ six_lock_increment(&dst->l[i].b->c.lock, -+ __btree_lock_want(dst, i)); ++ for (i = 0; i < BTREE_MAX_DEPTH; i++) { ++ unsigned t = btree_node_locked_type(dst, i); + -+ bch2_btree_path_check_sort(trans, dst, 0); ++ if (t != BTREE_NODE_UNLOCKED) ++ six_lock_increment(&dst->l[i].b->c.lock, t); ++ } +} + +static struct btree_path 
*btree_path_clone(struct btree_trans *trans, struct btree_path *src, @@ -21877,44 +22473,36 @@ index 000000000000..925ffb318445 + return new; +} + -+inline struct btree_path * __must_check -+bch2_btree_path_make_mut(struct btree_trans *trans, ++__flatten ++struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *trans, + struct btree_path *path, bool intent, + unsigned long ip) +{ -+ if (path->ref > 1 || path->preserve) { -+ __btree_path_put(path, intent); -+ path = btree_path_clone(trans, path, intent); -+ path->preserve = false; ++ __btree_path_put(path, intent); ++ path = btree_path_clone(trans, path, intent); ++ path->preserve = false; +#ifdef CONFIG_BCACHEFS_DEBUG -+ path->ip_allocated = ip; ++ path->ip_allocated = ip; +#endif -+ btree_trans_verify_sorted(trans); -+ } -+ -+ path->should_be_locked = false; ++ btree_trans_verify_sorted(trans); + return path; +} + +struct btree_path * __must_check -+bch2_btree_path_set_pos(struct btree_trans *trans, ++__bch2_btree_path_set_pos(struct btree_trans *trans, + struct btree_path *path, struct bpos new_pos, -+ bool intent, unsigned long ip) ++ bool intent, unsigned long ip, int cmp) +{ -+ int cmp = bpos_cmp(new_pos, path->pos); + unsigned l = path->level; + + EBUG_ON(trans->restarted); + EBUG_ON(!path->ref); + -+ if (!cmp) -+ return path; -+ + path = bch2_btree_path_make_mut(trans, path, intent, ip); + + path->pos = new_pos; + -+ bch2_btree_path_check_sort(trans, path, cmp); ++ bch2_btree_path_check_sort_fast(trans, path, cmp); + + if (unlikely(path->cached)) { + btree_node_unlock(trans, path, 0); @@ -21938,7 +22526,7 @@ index 000000000000..925ffb318445 + __btree_path_level_init(path, l); + } + -+ if (l != path->level) { ++ if (unlikely(l != path->level)) { + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + __bch2_btree_path_unlock(trans, path); + } @@ -22016,6 +22604,18 @@ index 000000000000..925ffb318445 + __bch2_path_free(trans, path); +} + ++static void bch2_path_put_nokeep(struct btree_trans *trans, 
struct btree_path *path, ++ bool intent) ++{ ++ EBUG_ON(trans->paths + path->idx != path); ++ EBUG_ON(!path->ref); ++ ++ if (!__btree_path_put(path, intent)) ++ return; ++ ++ __bch2_path_free(trans, path); ++} ++ +void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) +{ + struct btree_insert_entry *i; @@ -22224,15 +22824,17 @@ index 000000000000..925ffb318445 +inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u) +{ + ++ struct btree_path_level *l = path_l(path); ++ struct bkey_packed *_k; + struct bkey_s_c k; + ++ if (unlikely(!l->b)) ++ return bkey_s_c_null; ++ + EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE); + EBUG_ON(!btree_node_locked(path, path->level)); + + if (!path->cached) { -+ struct btree_path_level *l = path_l(path); -+ struct bkey_packed *_k; -+ + _k = bch2_btree_node_iter_peek_all(&l->iter, l->b); + k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null; + @@ -22464,7 +23066,8 @@ index 000000000000..925ffb318445 + if (bpos_cmp(start_pos, iter->journal_pos) < 0) + iter->journal_idx = 0; + -+ k = bch2_journal_keys_peek_upto(trans->c, iter->btree_id, 0, ++ k = bch2_journal_keys_peek_upto(trans->c, iter->btree_id, ++ iter->path->level, + start_pos, end_pos, + &iter->journal_idx); + @@ -22486,7 +23089,7 @@ index 000000000000..925ffb318445 +{ + struct bkey_i *next_journal = + bch2_btree_journal_peek(trans, iter, iter->path->pos, -+ k.k ? k.k->p : iter->path->l[0].b->key.k.p); ++ k.k ? 
k.k->p : path_l(iter->path)->b->key.k.p); + + if (next_journal) { + iter->k = next_journal->k; @@ -22546,10 +23149,12 @@ index 000000000000..925ffb318445 + struct bkey_s_c k, k2; + int ret; + -+ EBUG_ON(iter->path->cached || iter->path->level); ++ EBUG_ON(iter->path->cached); + bch2_btree_iter_verify(iter); + + while (1) { ++ struct btree_path_level *l; ++ + iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); @@ -22562,9 +23167,18 @@ index 000000000000..925ffb318445 + goto out; + } + ++ l = path_l(iter->path); ++ ++ if (unlikely(!l->b)) { ++ /* No btree nodes at requested level: */ ++ bch2_btree_iter_set_pos(iter, SPOS_MAX); ++ k = bkey_s_c_null; ++ goto out; ++ } ++ + btree_path_set_should_be_locked(iter->path); + -+ k = btree_path_level_peek_all(trans->c, &iter->path->l[0], &iter->k); ++ k = btree_path_level_peek_all(trans->c, l, &iter->k); + + if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && + k.k && @@ -22585,7 +23199,7 @@ index 000000000000..925ffb318445 + : NULL; + if (next_update && + bpos_cmp(next_update->k.p, -+ k.k ? k.k->p : iter->path->l[0].b->key.k.p) <= 0) { ++ k.k ? 
k.k->p : l->b->key.k.p) <= 0) { + iter->k = next_update->k; + k = bkey_i_to_s_c(next_update); + } @@ -22606,9 +23220,9 @@ index 000000000000..925ffb318445 + + if (likely(k.k)) { + break; -+ } else if (likely(bpos_cmp(iter->path->l[0].b->key.k.p, SPOS_MAX))) { ++ } else if (likely(bpos_cmp(l->b->key.k.p, SPOS_MAX))) { + /* Advance to next leaf node: */ -+ search_key = bpos_successor(iter->path->l[0].b->key.k.p); ++ search_key = bpos_successor(l->b->key.k.p); + } else { + /* End of btree: */ + bch2_btree_iter_set_pos(iter, SPOS_MAX); @@ -22637,8 +23251,8 @@ index 000000000000..925ffb318445 + EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS); + + if (iter->update_path) { -+ bch2_path_put(trans, iter->update_path, -+ iter->flags & BTREE_ITER_INTENT); ++ bch2_path_put_nokeep(trans, iter->update_path, ++ iter->flags & BTREE_ITER_INTENT); + iter->update_path = NULL; + } + @@ -22669,8 +23283,8 @@ index 000000000000..925ffb318445 + + if (iter->update_path && + bkey_cmp(iter->update_path->pos, k.k->p)) { -+ bch2_path_put(trans, iter->update_path, -+ iter->flags & BTREE_ITER_INTENT); ++ bch2_path_put_nokeep(trans, iter->update_path, ++ iter->flags & BTREE_ITER_INTENT); + iter->update_path = NULL; + } + @@ -22918,7 +23532,7 @@ index 000000000000..925ffb318445 + * that candidate + */ + if (saved_path && bkey_cmp(k.k->p, saved_k.p)) { -+ bch2_path_put(trans, iter->path, ++ bch2_path_put_nokeep(trans, iter->path, + iter->flags & BTREE_ITER_INTENT); + iter->path = saved_path; + saved_path = NULL; @@ -22931,7 +23545,7 @@ index 000000000000..925ffb318445 + iter->snapshot, + k.k->p.snapshot)) { + if (saved_path) -+ bch2_path_put(trans, saved_path, ++ bch2_path_put_nokeep(trans, saved_path, + iter->flags & BTREE_ITER_INTENT); + saved_path = btree_path_clone(trans, iter->path, + iter->flags & BTREE_ITER_INTENT); @@ -22975,7 +23589,7 @@ index 000000000000..925ffb318445 + btree_path_set_should_be_locked(iter->path); +out_no_locked: + if (saved_path) -+ bch2_path_put(trans, saved_path, 
iter->flags & BTREE_ITER_INTENT); ++ bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_INTENT); + + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); @@ -23057,6 +23671,8 @@ index 000000000000..925ffb318445 + } + + k = bch2_btree_path_peek_slot(iter->path, &iter->k); ++ if (unlikely(!k.k)) ++ goto out_no_locked; + } else { + struct bpos next; + @@ -23187,6 +23803,35 @@ index 000000000000..925ffb318445 + btree_path_verify_sorted_ref(trans, r); +} + ++static inline struct btree_path *sib_btree_path(struct btree_trans *trans, ++ struct btree_path *path, int sib) ++{ ++ unsigned idx = (unsigned) path->sorted_idx + sib; ++ ++ EBUG_ON(sib != -1 && sib != 1); ++ ++ return idx < trans->nr_sorted ++ ? trans->paths + trans->sorted[idx] ++ : NULL; ++} ++ ++static __always_inline void bch2_btree_path_check_sort_fast(struct btree_trans *trans, ++ struct btree_path *path, ++ int cmp) ++{ ++ struct btree_path *n; ++ int cmp2; ++ ++ EBUG_ON(!cmp); ++ ++ while ((n = sib_btree_path(trans, path, cmp)) && ++ (cmp2 = btree_path_cmp(n, path)) && ++ cmp2 != cmp) ++ btree_path_swap(trans, n, path); ++ ++ btree_trans_verify_sorted(trans); ++} ++ +inline void bch2_btree_path_check_sort(struct btree_trans *trans, struct btree_path *path, + int cmp) +{ @@ -23263,7 +23908,7 @@ index 000000000000..925ffb318445 + bch2_path_put(trans, iter->path, + iter->flags & BTREE_ITER_INTENT); + if (iter->update_path) -+ bch2_path_put(trans, iter->update_path, ++ bch2_path_put_nokeep(trans, iter->update_path, + iter->flags & BTREE_ITER_INTENT); + if (iter->key_cache_path) + bch2_path_put(trans, iter->key_cache_path, @@ -23281,7 +23926,7 @@ index 000000000000..925ffb318445 + unsigned flags, + unsigned long ip) +{ -+ if (trans->restarted) ++ if (unlikely(trans->restarted)) + panic("bch2_trans_iter_init(): in transaction restart, %s by %pS\n", + bch2_err_str(trans->restarted), + (void *) trans->last_restarted_ip); @@ -23301,7 +23946,7 @@ index 000000000000..925ffb318445 + 
btree_type_has_snapshots(btree_id)) + flags |= BTREE_ITER_FILTER_SNAPSHOTS; + -+ if (!test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags)) ++ if (trans->journal_replay_not_finished) + flags |= BTREE_ITER_WITH_JOURNAL; + + iter->trans = trans; @@ -23454,7 +24099,7 @@ index 000000000000..925ffb318445 + + if (!trans->restarted && + (need_resched() || -+ ktime_get_ns() - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) { ++ local_clock() - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) { + bch2_trans_unlock(trans); + cond_resched(); + bch2_trans_relock(trans); @@ -23464,7 +24109,7 @@ index 000000000000..925ffb318445 + if (trans->restarted) + bch2_btree_path_traverse_all(trans); + -+ trans->last_begin_time = ktime_get_ns(); ++ trans->last_begin_time = local_clock(); + return trans->restart_count; +} + @@ -23485,7 +24130,7 @@ index 000000000000..925ffb318445 + BUG_ON(trans->used_mempool); + +#ifdef __KERNEL__ -+ p = this_cpu_xchg(c->btree_paths_bufs->path , NULL); ++ p = this_cpu_xchg(c->btree_paths_bufs->path, NULL); +#endif + if (!p) + p = mempool_alloc(&trans->c->btree_paths_pool, GFP_NOFS); @@ -23494,15 +24139,16 @@ index 000000000000..925ffb318445 + trans->updates = p; p += updates_bytes; +} + -+static inline unsigned bch2_trans_get_fn_idx(struct btree_trans *trans, struct bch_fs *c, -+ const char *fn) ++const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR]; ++ ++unsigned bch2_trans_get_fn_idx(const char *fn) +{ + unsigned i; + -+ for (i = 0; i < ARRAY_SIZE(c->btree_transaction_fns); i++) -+ if (!c->btree_transaction_fns[i] || -+ c->btree_transaction_fns[i] == fn) { -+ c->btree_transaction_fns[i] = fn; ++ for (i = 0; i < ARRAY_SIZE(bch2_btree_transaction_fns); i++) ++ if (!bch2_btree_transaction_fns[i] || ++ bch2_btree_transaction_fns[i] == fn) { ++ bch2_btree_transaction_fns[i] = fn; + return i; + } + @@ -23510,7 +24156,7 @@ index 000000000000..925ffb318445 + return i; +} + -+void __bch2_trans_init(struct btree_trans *trans, 
struct bch_fs *c, const char *fn) ++void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_idx) + __acquires(&c->btree_trans_barrier) +{ + struct btree_transaction_stats *s; @@ -23520,16 +24166,19 @@ index 000000000000..925ffb318445 + + memset(trans, 0, sizeof(*trans)); + trans->c = c; -+ trans->fn = fn; -+ trans->last_begin_time = ktime_get_ns(); -+ trans->fn_idx = bch2_trans_get_fn_idx(trans, c, fn); ++ trans->fn = fn_idx < ARRAY_SIZE(bch2_btree_transaction_fns) ++ ? bch2_btree_transaction_fns[fn_idx] : NULL; ++ trans->last_begin_time = local_clock(); ++ trans->fn_idx = fn_idx; + trans->locking_wait.task = current; ++ trans->journal_replay_not_finished = ++ !test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags); + closure_init_stack(&trans->ref); + + bch2_trans_alloc_paths(trans, c); + + s = btree_trans_stats(trans); -+ if (s) { ++ if (s && s->max_mem) { + unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem); + + trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL); @@ -23540,9 +24189,9 @@ index 000000000000..925ffb318445 + } else { + trans->mem_bytes = expected_mem_bytes; + } -+ -+ trans->nr_max_paths = s->nr_max_paths; + } ++ if (s) ++ trans->nr_max_paths = s->nr_max_paths; + + trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); + @@ -23648,7 +24297,7 @@ index 000000000000..925ffb318445 + + rcu_read_lock(); + owner = READ_ONCE(b->lock.owner); -+ pid = owner ? owner->pid : 0;; ++ pid = owner ? 
owner->pid : 0; + rcu_read_unlock(); + + prt_tab(out); @@ -23710,6 +24359,13 @@ index 000000000000..925ffb318445 + +void bch2_fs_btree_iter_exit(struct bch_fs *c) +{ ++ struct btree_transaction_stats *s; ++ ++ for (s = c->btree_transaction_stats; ++ s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); ++ s++) ++ kfree(s->max_paths_text); ++ + if (c->btree_trans_barrier_initialized) + cleanup_srcu_struct(&c->btree_trans_barrier); + mempool_exit(&c->btree_trans_mem_pool); @@ -23739,10 +24395,10 @@ index 000000000000..925ffb318445 +} diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h new file mode 100644 -index 000000000000..910f6d7bc961 +index 000000000000..8c35d7d45d8e --- /dev/null +++ b/fs/bcachefs/btree_iter.h -@@ -0,0 +1,564 @@ +@@ -0,0 +1,599 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_ITER_H +#define _BCACHEFS_BTREE_ITER_H @@ -23876,12 +24532,36 @@ index 000000000000..910f6d7bc961 + _path = __trans_next_path_with_node((_trans), (_b), \ + (_path)->idx + 1)) + -+struct btree_path * __must_check -+bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *, ++struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *, + bool, unsigned long); ++ ++static inline struct btree_path * __must_check ++bch2_btree_path_make_mut(struct btree_trans *trans, ++ struct btree_path *path, bool intent, ++ unsigned long ip) ++{ ++ if (path->ref > 1 || path->preserve) ++ path = __bch2_btree_path_make_mut(trans, path, intent, ip); ++ path->should_be_locked = false; ++ return path; ++} ++ +struct btree_path * __must_check -+bch2_btree_path_set_pos(struct btree_trans *, struct btree_path *, -+ struct bpos, bool, unsigned long); ++__bch2_btree_path_set_pos(struct btree_trans *, struct btree_path *, ++ struct bpos, bool, unsigned long, int); ++ ++static inline struct btree_path * __must_check ++bch2_btree_path_set_pos(struct btree_trans *trans, ++ struct btree_path *path, struct bpos new_pos, 
++ bool intent, unsigned long ip) ++{ ++ int cmp = bpos_cmp(new_pos, path->pos); ++ ++ return cmp ++ ? __bch2_btree_path_set_pos(trans, path, new_pos, intent, ip, cmp) ++ : path; ++} ++ +int __must_check bch2_btree_path_traverse(struct btree_trans *, + struct btree_path *, unsigned); +struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpos, @@ -24296,10 +24976,21 @@ index 000000000000..910f6d7bc961 +void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *); +void bch2_dump_trans_updates(struct btree_trans *); +void bch2_dump_trans_paths_updates(struct btree_trans *); -+void __bch2_trans_init(struct btree_trans *, struct bch_fs *, const char *); ++void __bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned); +void bch2_trans_exit(struct btree_trans *); + -+#define bch2_trans_init(_trans, _c, _nr_iters, _mem) __bch2_trans_init(_trans, _c, __func__) ++extern const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR]; ++unsigned bch2_trans_get_fn_idx(const char *); ++ ++#define bch2_trans_init(_trans, _c, _nr_iters, _mem) \ ++do { \ ++ static unsigned trans_fn_idx; \ ++ \ ++ if (unlikely(!trans_fn_idx)) \ ++ trans_fn_idx = bch2_trans_get_fn_idx(__func__); \ ++ \ ++ __bch2_trans_init(_trans, _c, trans_fn_idx); \ ++} while (0) + +void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *); + @@ -24309,10 +25000,11 @@ index 000000000000..910f6d7bc961 +#endif /* _BCACHEFS_BTREE_ITER_H */ diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c new file mode 100644 -index 000000000000..35e941949f49 +index 000000000000..66fb69801318 --- /dev/null +++ b/fs/bcachefs/btree_key_cache.c -@@ -0,0 +1,983 @@ +@@ -0,0 +1,1034 @@ ++// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_cache.h" @@ -24418,15 +25110,34 @@ index 000000000000..35e941949f49 + six_unlock_intent(&ck->c.lock); +} + ++#ifdef __KERNEL__ ++static void __bkey_cached_move_to_freelist_ordered(struct 
btree_key_cache *bc, ++ struct bkey_cached *ck) ++{ ++ struct bkey_cached *pos; ++ ++ list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) { ++ if (ULONG_CMP_GE(ck->btree_trans_barrier_seq, ++ pos->btree_trans_barrier_seq)) { ++ list_move(&ck->list, &pos->list); ++ return; ++ } ++ } ++ ++ list_move(&ck->list, &bc->freed_nonpcpu); ++} ++#endif ++ +static void bkey_cached_move_to_freelist(struct btree_key_cache *bc, + struct bkey_cached *ck) +{ -+ struct btree_key_cache_freelist *f; -+ bool freed = false; -+ + BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); + + if (!ck->c.lock.readers) { ++#ifdef __KERNEL__ ++ struct btree_key_cache_freelist *f; ++ bool freed = false; ++ + preempt_disable(); + f = this_cpu_ptr(bc->pcpu_freed); + @@ -24444,13 +25155,18 @@ index 000000000000..35e941949f49 + while (f->nr > ARRAY_SIZE(f->objs) / 2) { + struct bkey_cached *ck2 = f->objs[--f->nr]; + -+ list_move_tail(&ck2->list, &bc->freed_nonpcpu); ++ __bkey_cached_move_to_freelist_ordered(bc, ck2); + } + preempt_enable(); + -+ list_move_tail(&ck->list, &bc->freed_nonpcpu); ++ __bkey_cached_move_to_freelist_ordered(bc, ck); + mutex_unlock(&bc->lock); + } ++#else ++ mutex_lock(&bc->lock); ++ list_move_tail(&ck->list, &bc->freed_nonpcpu); ++ mutex_unlock(&bc->lock); ++#endif + } else { + mutex_lock(&bc->lock); + list_move_tail(&ck->list, &bc->freed_pcpu); @@ -24485,10 +25201,12 @@ index 000000000000..35e941949f49 + struct bch_fs *c = trans->c; + struct btree_key_cache *bc = &c->btree_key_cache; + struct bkey_cached *ck = NULL; -+ struct btree_key_cache_freelist *f; + bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id); + + if (!pcpu_readers) { ++#ifdef __KERNEL__ ++ struct btree_key_cache_freelist *f; ++ + preempt_disable(); + f = this_cpu_ptr(bc->pcpu_freed); + if (f->nr) @@ -24511,6 +25229,14 @@ index 000000000000..35e941949f49 + preempt_enable(); + mutex_unlock(&bc->lock); + } ++#else ++ mutex_lock(&bc->lock); ++ if (!list_empty(&bc->freed_nonpcpu)) { ++ ck = 
list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list); ++ list_del_init(&ck->list); ++ } ++ mutex_unlock(&bc->lock); ++#endif + } else { + mutex_lock(&bc->lock); + if (!list_empty(&bc->freed_pcpu)) { @@ -24543,6 +25269,7 @@ index 000000000000..35e941949f49 + return ck; + } + ++ /* GFP_NOFS because we're holding btree locks: */ + ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO); + if (likely(ck)) { + INIT_LIST_HEAD(&ck->list); @@ -24567,6 +25294,7 @@ index 000000000000..35e941949f49 + struct bkey_cached *ck; + unsigned i; + ++ mutex_lock(&c->lock); + rcu_read_lock(); + tbl = rht_dereference_rcu(c->table.tbl, &c->table); + for (i = 0; i < tbl->size; i++) @@ -24574,13 +25302,14 @@ index 000000000000..35e941949f49 + if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && + bkey_cached_lock_for_evict(ck)) { + bkey_cached_evict(c, ck); -+ rcu_read_unlock(); -+ return ck; ++ goto out; + } + } ++ ck = NULL; ++out: + rcu_read_unlock(); -+ -+ return NULL; ++ mutex_unlock(&c->lock); ++ return ck; +} + +static struct bkey_cached * @@ -24592,7 +25321,7 @@ index 000000000000..35e941949f49 + bool was_new = true; + + ck = bkey_cached_alloc(trans, path); -+ if (unlikely(IS_ERR(ck))) ++ if (IS_ERR(ck)) + return ck; + + if (unlikely(!ck)) { @@ -24713,7 +25442,7 @@ index 000000000000..35e941949f49 + return ret; +} + -+noinline static int ++static noinline int +bch2_btree_path_traverse_cached_slowpath(struct btree_trans *trans, struct btree_path *path, + unsigned flags) +{ @@ -24894,7 +25623,7 @@ index 000000000000..35e941949f49 + * Since journal reclaim depends on us making progress here, and the + * allocator/copygc depend on journal reclaim making progress, we need + * to be using alloc reserves: -+ * */ ++ */ + ret = bch2_btree_iter_traverse(&b_iter) ?: + bch2_trans_update(trans, &b_iter, ck->k, + BTREE_UPDATE_KEY_CACHE_RECLAIM| @@ -25074,12 +25803,7 @@ index 000000000000..35e941949f49 + unsigned start, flags; + int srcu_idx; + -+ /* Return -1 if we can't do anything 
right now */ -+ if (sc->gfp_mask & __GFP_FS) -+ mutex_lock(&bc->lock); -+ else if (!mutex_trylock(&bc->lock)) -+ return -1; -+ ++ mutex_lock(&bc->lock); + srcu_idx = srcu_read_lock(&c->btree_trans_barrier); + flags = memalloc_nofs_save(); + @@ -25184,23 +25908,31 @@ index 000000000000..35e941949f49 + struct bkey_cached *ck, *n; + struct rhash_head *pos; + unsigned i; ++#ifdef __KERNEL__ + int cpu; ++#endif + + if (bc->shrink.list.next) + unregister_shrinker(&bc->shrink); + + mutex_lock(&bc->lock); + -+ rcu_read_lock(); -+ tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); -+ if (tbl) -+ for (i = 0; i < tbl->size; i++) -+ rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { -+ bkey_cached_evict(bc, ck); -+ list_add(&ck->list, &bc->freed_nonpcpu); -+ } -+ rcu_read_unlock(); ++ /* ++ * The loop is needed to guard against racing with rehash: ++ */ ++ while (atomic_long_read(&bc->nr_keys)) { ++ rcu_read_lock(); ++ tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); ++ if (tbl) ++ for (i = 0; i < tbl->size; i++) ++ rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { ++ bkey_cached_evict(bc, ck); ++ list_add(&ck->list, &bc->freed_nonpcpu); ++ } ++ rcu_read_unlock(); ++ } + ++#ifdef __KERNEL__ + for_each_possible_cpu(cpu) { + struct btree_key_cache_freelist *f = + per_cpu_ptr(bc->pcpu_freed, cpu); @@ -25210,6 +25942,7 @@ index 000000000000..35e941949f49 + list_add(&ck->list, &bc->freed_nonpcpu); + } + } ++#endif + + list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu); + @@ -25225,10 +25958,15 @@ index 000000000000..35e941949f49 + kmem_cache_free(bch2_key_cache, ck); + } + -+ BUG_ON(atomic_long_read(&bc->nr_dirty) && -+ !bch2_journal_error(&c->journal) && -+ test_bit(BCH_FS_WAS_RW, &c->flags)); -+ BUG_ON(atomic_long_read(&bc->nr_keys)); ++ if (atomic_long_read(&bc->nr_dirty) && ++ !bch2_journal_error(&c->journal) && ++ test_bit(BCH_FS_WAS_RW, &c->flags)) ++ panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n", ++ atomic_long_read(&bc->nr_dirty)); ++ ++ if 
(atomic_long_read(&bc->nr_keys)) ++ panic("btree key cache shutdown error: nr_keys nonzero (%li)\n", ++ atomic_long_read(&bc->nr_keys)); + + mutex_unlock(&bc->lock); + @@ -25258,9 +25996,11 @@ index 000000000000..35e941949f49 + struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); + int ret; + ++#ifdef __KERNEL__ + bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist); + if (!bc->pcpu_freed) + return -ENOMEM; ++#endif + + ret = rhashtable_init(&bc->table, &bch2_btree_key_cache_params); + if (ret) @@ -25268,7 +26008,7 @@ index 000000000000..35e941949f49 + + bc->table_init_done = true; + -+ bc->shrink.seeks = 1; ++ bc->shrink.seeks = 0; + bc->shrink.count_objects = bch2_btree_key_cache_count; + bc->shrink.scan_objects = bch2_btree_key_cache_scan; + bc->shrink.to_text = bch2_btree_key_cache_shrinker_to_text; @@ -25277,15 +26017,17 @@ index 000000000000..35e941949f49 + +void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) +{ -+ prt_printf(out, "nr_freed:\t%zu\n", atomic_long_read(&c->nr_freed)); -+ prt_printf(out, "nr_keys:\t%lu\n", atomic_long_read(&c->nr_keys)); -+ prt_printf(out, "nr_dirty:\t%lu\n", atomic_long_read(&c->nr_dirty)); ++ prt_printf(out, "nr_freed:\t%zu", atomic_long_read(&c->nr_freed)); ++ prt_newline(out); ++ prt_printf(out, "nr_keys:\t%lu", atomic_long_read(&c->nr_keys)); ++ prt_newline(out); ++ prt_printf(out, "nr_dirty:\t%lu", atomic_long_read(&c->nr_dirty)); ++ prt_newline(out); +} + +void bch2_btree_key_cache_exit(void) +{ -+ if (bch2_key_cache) -+ kmem_cache_destroy(bch2_key_cache); ++ kmem_cache_destroy(bch2_key_cache); +} + +int __init bch2_btree_key_cache_init(void) @@ -25298,10 +26040,11 @@ index 000000000000..35e941949f49 +} diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h new file mode 100644 -index 000000000000..670746e72dab +index 000000000000..eccea15fca79 --- /dev/null +++ b/fs/bcachefs/btree_key_cache.h -@@ -0,0 +1,47 @@ +@@ -0,0 +1,48 @@ ++/* 
SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_KEY_CACHE_H +#define _BCACHEFS_BTREE_KEY_CACHE_H + @@ -25351,10 +26094,10 @@ index 000000000000..670746e72dab +#endif /* _BCACHEFS_BTREE_KEY_CACHE_H */ diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c new file mode 100644 -index 000000000000..f4340086c357 +index 000000000000..9d090437d8f6 --- /dev/null +++ b/fs/bcachefs/btree_locking.c -@@ -0,0 +1,676 @@ +@@ -0,0 +1,679 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -25451,108 +26194,13 @@ index 000000000000..f4340086c357 + prt_newline(out); +} + -+static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i) -+{ -+ int ret; -+ -+ if (i == g->g) { -+ trace_and_count(i->trans->c, trans_restart_would_deadlock, i->trans, _RET_IP_); -+ ret = btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock); -+ } else { -+ i->trans->lock_must_abort = true; -+ ret = 0; -+ } -+ -+ for (i = g->g + 1; i < g->g + g->nr; i++) -+ wake_up_process(i->trans->locking_wait.task); -+ return ret; -+} -+ -+static noinline int break_cycle(struct lock_graph *g) -+{ -+ struct trans_waiting_for_lock *i; -+ -+ for (i = g->g; i < g->g + g->nr; i++) { -+ if (i->trans->lock_may_not_fail || -+ i->trans->locking_wait.lock_want == SIX_LOCK_write) -+ continue; -+ -+ return abort_lock(g, i); -+ } -+ -+ for (i = g->g; i < g->g + g->nr; i++) { -+ if (i->trans->lock_may_not_fail || -+ !i->trans->in_traverse_all) -+ continue; -+ -+ return abort_lock(g, i); -+ } -+ -+ for (i = g->g; i < g->g + g->nr; i++) { -+ if (i->trans->lock_may_not_fail) -+ continue; -+ -+ return abort_lock(g, i); -+ } -+ -+ BUG(); -+} -+ -+static void lock_graph_pop(struct lock_graph *g) ++static void lock_graph_up(struct lock_graph *g) +{ + closure_put(&g->g[--g->nr].trans->ref); +} + -+static void lock_graph_pop_above(struct lock_graph *g, struct trans_waiting_for_lock *above, -+ struct printbuf *cycle) ++static void lock_graph_down(struct lock_graph 
*g, struct btree_trans *trans) +{ -+ if (g->nr > 1 && cycle) -+ print_chain(cycle, g); -+ -+ while (g->g + g->nr > above) -+ lock_graph_pop(g); -+} -+ -+static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans, -+ struct printbuf *cycle) -+{ -+ struct btree_trans *orig_trans = g->g->trans; -+ struct trans_waiting_for_lock *i; -+ int ret = 0; -+ -+ for (i = g->g; i < g->g + g->nr; i++) { -+ if (i->trans->locking != i->node_want) { -+ lock_graph_pop_above(g, i - 1, cycle); -+ return 0; -+ } -+ -+ if (i->trans == trans) { -+ if (cycle) { -+ /* Only checking: */ -+ print_cycle(cycle, g); -+ ret = -1; -+ } else { -+ ret = break_cycle(g); -+ } -+ -+ if (ret) -+ goto deadlock; -+ /* -+ * If we didn't abort (instead telling another -+ * transaction to abort), keep checking: -+ */ -+ } -+ } -+ -+ if (g->nr == ARRAY_SIZE(g->g)) { -+ if (orig_trans->lock_may_not_fail) -+ return 0; -+ -+ trace_and_count(trans->c, trans_restart_would_deadlock_recursion_limit, trans, _RET_IP_); -+ ret = btree_trans_restart(orig_trans, BCH_ERR_transaction_restart_deadlock_recursion_limit); -+ goto deadlock; -+ } -+ + closure_get(&trans->ref); + + g->g[g->nr++] = (struct trans_waiting_for_lock) { @@ -25560,25 +26208,124 @@ index 000000000000..f4340086c357 + .node_want = trans->locking, + .lock_want = trans->locking_wait.lock_want, + }; -+ -+ return 0; -+deadlock: -+ lock_graph_pop_above(g, g->g, cycle); -+ return ret; +} + -+static noinline void lock_graph_remove_non_waiters(struct lock_graph *g, -+ struct printbuf *cycle) ++static bool lock_graph_remove_non_waiters(struct lock_graph *g) +{ + struct trans_waiting_for_lock *i; + + for (i = g->g + 1; i < g->g + g->nr; i++) + if (i->trans->locking != i->node_want || + i->trans->locking_wait.start_time != i[-1].lock_start_time) { -+ lock_graph_pop_above(g, i - 1, cycle); -+ return; ++ while (g->g + g->nr > i) ++ lock_graph_up(g); ++ return true; + } -+ BUG(); ++ ++ return false; ++} ++ ++static int abort_lock(struct lock_graph *g, 
struct trans_waiting_for_lock *i) ++{ ++ if (i == g->g) { ++ trace_and_count(i->trans->c, trans_restart_would_deadlock, i->trans, _RET_IP_); ++ return btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock); ++ } else { ++ i->trans->lock_must_abort = true; ++ wake_up_process(i->trans->locking_wait.task); ++ return 0; ++ } ++} ++ ++static int btree_trans_abort_preference(struct btree_trans *trans) ++{ ++ if (trans->lock_may_not_fail) ++ return 0; ++ if (trans->locking_wait.lock_want == SIX_LOCK_write) ++ return 1; ++ if (!trans->in_traverse_all) ++ return 2; ++ return 3; ++} ++ ++static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle) ++{ ++ struct trans_waiting_for_lock *i, *abort = NULL; ++ unsigned best = 0, pref; ++ int ret; ++ ++ if (lock_graph_remove_non_waiters(g)) ++ return 0; ++ ++ /* Only checking, for debugfs: */ ++ if (cycle) { ++ print_cycle(cycle, g); ++ ret = -1; ++ goto out; ++ } ++ ++ for (i = g->g; i < g->g + g->nr; i++) { ++ pref = btree_trans_abort_preference(i->trans); ++ if (pref > best) { ++ abort = i; ++ best = pref; ++ } ++ } ++ ++ if (unlikely(!best)) { ++ struct bch_fs *c = g->g->trans->c; ++ struct printbuf buf = PRINTBUF; ++ ++ bch_err(c, "cycle of nofail locks"); ++ ++ for (i = g->g; i < g->g + g->nr; i++) { ++ struct btree_trans *trans = i->trans; ++ ++ bch2_btree_trans_to_text(&buf, trans); ++ ++ prt_printf(&buf, "backtrace:"); ++ prt_newline(&buf); ++ printbuf_indent_add(&buf, 2); ++ bch2_prt_backtrace(&buf, trans->locking_wait.task); ++ printbuf_indent_sub(&buf, 2); ++ prt_newline(&buf); ++ } ++ ++ bch2_print_string_as_lines(KERN_ERR, buf.buf); ++ printbuf_exit(&buf); ++ BUG(); ++ } ++ ++ ret = abort_lock(g, abort); ++out: ++ if (ret) ++ while (g->nr) ++ lock_graph_up(g); ++ return ret; ++} ++ ++static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans, ++ struct printbuf *cycle) ++{ ++ struct btree_trans *orig_trans = g->g->trans; ++ struct trans_waiting_for_lock *i; ++ ++ 
for (i = g->g; i < g->g + g->nr; i++) ++ if (i->trans == trans) ++ return break_cycle(g, cycle); ++ ++ if (g->nr == ARRAY_SIZE(g->g)) { ++ if (orig_trans->lock_may_not_fail) ++ return 0; ++ ++ while (g->nr) ++ lock_graph_up(g); ++ trace_and_count(trans->c, trans_restart_would_deadlock_recursion_limit, trans, _RET_IP_); ++ return btree_trans_restart(orig_trans, BCH_ERR_transaction_restart_deadlock_recursion_limit); ++ } ++ ++ lock_graph_down(g, trans); ++ return 0; +} + +static bool lock_type_conflicts(enum six_lock_type t1, enum six_lock_type t2) @@ -25600,8 +26347,7 @@ index 000000000000..f4340086c357 + } + + g.nr = 0; -+ ret = lock_graph_descend(&g, trans, cycle); -+ BUG_ON(ret); ++ lock_graph_down(&g, trans); +next: + if (!g.nr) + return 0; @@ -25628,8 +26374,8 @@ index 000000000000..f4340086c357 + + b = &READ_ONCE(path->l[top->level].b)->c; + -+ if (unlikely(IS_ERR_OR_NULL(b))) { -+ lock_graph_remove_non_waiters(&g, cycle); ++ if (IS_ERR_OR_NULL(b)) { ++ BUG_ON(!lock_graph_remove_non_waiters(&g)); + goto next; + } + @@ -25655,7 +26401,7 @@ index 000000000000..f4340086c357 + raw_spin_unlock(&b->lock.wait_lock); + + if (ret) -+ return ret < 0 ? 
ret : 0; ++ return ret; + goto next; + + } @@ -25665,7 +26411,7 @@ index 000000000000..f4340086c357 + + if (g.nr > 1 && cycle) + print_chain(cycle, &g); -+ lock_graph_pop(&g); ++ lock_graph_up(&g); + goto next; +} + @@ -25959,7 +26705,7 @@ index 000000000000..f4340086c357 + struct btree_path *path; + + if (unlikely(trans->restarted)) -+ return - ((int) trans->restarted); ++ return -((int) trans->restarted); + + trans_for_each_path(trans, path) + if (path->should_be_locked && @@ -26033,10 +26779,10 @@ index 000000000000..f4340086c357 +#endif diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h new file mode 100644 -index 000000000000..d91b42bf1de1 +index 000000000000..fb237c95ee13 --- /dev/null +++ b/fs/bcachefs/btree_locking.h -@@ -0,0 +1,418 @@ +@@ -0,0 +1,419 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_LOCKING_H +#define _BCACHEFS_BTREE_LOCKING_H @@ -26127,7 +26873,7 @@ index 000000000000..d91b42bf1de1 +{ + mark_btree_node_locked_noreset(path, level, type); +#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS -+ path->l[level].lock_taken_time = ktime_get_ns(); ++ path->l[level].lock_taken_time = local_clock(); +#endif +} + @@ -26159,7 +26905,7 @@ index 000000000000..d91b42bf1de1 + if (s) + __bch2_time_stats_update(&s->lock_hold_times, + path->l[level].lock_taken_time, -+ ktime_get_ns()); ++ local_clock()); +#endif +} + @@ -26299,7 +27045,7 @@ index 000000000000..d91b42bf1de1 + btree_node_lock_increment(trans, b, level, type) || + !(ret = btree_node_lock_nopath(trans, b, type))) { +#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS -+ path->l[b->level].lock_taken_time = ktime_get_ns(); ++ path->l[b->level].lock_taken_time = local_clock(); +#endif + } + @@ -26335,6 +27081,7 @@ index 000000000000..d91b42bf1de1 + struct btree_bkey_cached_common *b) +{ + int ret = __btree_node_lock_write(trans, path, b, true); ++ + BUG_ON(ret); +} + @@ -26457,10 +27204,10 @@ index 000000000000..d91b42bf1de1 +#endif /* _BCACHEFS_BTREE_LOCKING_H */ diff --git 
a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h new file mode 100644 -index 000000000000..af226eed818b +index 000000000000..d89489e4e4a5 --- /dev/null +++ b/fs/bcachefs/btree_types.h -@@ -0,0 +1,696 @@ +@@ -0,0 +1,708 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_TYPES_H +#define _BCACHEFS_BTREE_TYPES_H @@ -26469,7 +27216,7 @@ index 000000000000..af226eed818b +#include +#include + -+#include "bkey_methods.h" ++//#include "bkey_methods.h" +#include "buckets_types.h" +#include "darray.h" +#include "journal_types.h" @@ -26540,6 +27287,7 @@ index 000000000000..af226eed818b + u8 nsets; + u8 nr_key_bits; + u16 version_ondisk; ++ u8 write_type; + + struct bkey_format format; + @@ -26623,6 +27371,16 @@ index 000000000000..af226eed818b + /* Number of elements in live + freeable lists */ + unsigned used; + unsigned reserve; ++ unsigned freed; ++ unsigned not_freed_lock_intent; ++ unsigned not_freed_lock_write; ++ unsigned not_freed_dirty; ++ unsigned not_freed_read_in_flight; ++ unsigned not_freed_write_in_flight; ++ unsigned not_freed_noevict; ++ unsigned not_freed_write_blocked; ++ unsigned not_freed_will_make_reachable; ++ unsigned not_freed_access_bit; + atomic_t dirty; + struct shrinker shrink; + @@ -26779,7 +27537,7 @@ index 000000000000..af226eed818b +struct bkey_cached_key { + u32 btree_id; + struct bpos pos; -+} __attribute__((packed, aligned(4))); ++} __packed __aligned(4); + +#define BKEY_CACHED_ACCESSED 0 +#define BKEY_CACHED_DIRTY 1 @@ -26871,6 +27629,7 @@ index 000000000000..af226eed818b + bool in_traverse_all:1; + bool memory_allocation_failure:1; + bool is_initial_gc:1; ++ bool journal_replay_not_finished:1; + enum bch_errcode restarted:16; + u32 restart_count; + unsigned long last_restarted_ip; @@ -27159,7 +27918,7 @@ index 000000000000..af226eed818b +#endif /* _BCACHEFS_BTREE_TYPES_H */ diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h new file mode 100644 -index 000000000000..89941fb8caa0 +index 
000000000000..1c2e7b2b4ed5 --- /dev/null +++ b/fs/bcachefs/btree_update.h @@ -0,0 +1,158 @@ @@ -27173,8 +27932,8 @@ index 000000000000..89941fb8caa0 +struct bch_fs; +struct btree; + -+void bch2_btree_node_lock_for_insert(struct btree_trans *, struct btree_path *, -+ struct btree *); ++void bch2_btree_node_prep_for_write(struct btree_trans *, ++ struct btree_path *, struct btree *); +bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *, + struct btree *, struct btree_node_iter *, + struct bkey_i *); @@ -27323,10 +28082,10 @@ index 000000000000..89941fb8caa0 +#endif /* _BCACHEFS_BTREE_UPDATE_H */ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c new file mode 100644 -index 000000000000..578ba747826e +index 000000000000..dac2fa6b08ee --- /dev/null +++ b/fs/bcachefs/btree_update_interior.c -@@ -0,0 +1,2352 @@ +@@ -0,0 +1,2437 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -27352,9 +28111,9 @@ index 000000000000..578ba747826e +#include +#include + -+static void bch2_btree_insert_node(struct btree_update *, struct btree_trans *, -+ struct btree_path *, struct btree *, -+ struct keylist *, unsigned); ++static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *, ++ struct btree_path *, struct btree *, ++ struct keylist *, unsigned); +static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); + +static struct btree_path *get_unlocked_mut_path(struct btree_trans *trans, @@ -27366,8 +28125,8 @@ index 000000000000..578ba747826e + + path = bch2_path_get(trans, btree_id, pos, level + 1, level, + BTREE_ITER_NOPRESERVE| -+ BTREE_ITER_INTENT, _THIS_IP_); -+ path = bch2_btree_path_make_mut(trans, path, true, _THIS_IP_); ++ BTREE_ITER_INTENT, _RET_IP_); ++ path = bch2_btree_path_make_mut(trans, path, true, _RET_IP_); + bch2_btree_path_downgrade(trans, path); + __bch2_btree_path_unlock(trans, path); + return path; @@ -27524,6 +28283,43 @@ index 
000000000000..578ba747826e + } +} + ++static void bch2_btree_node_free_never_used(struct btree_update *as, ++ struct btree_trans *trans, ++ struct btree *b) ++{ ++ struct bch_fs *c = as->c; ++ struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL]; ++ struct btree_path *path; ++ unsigned level = b->c.level; ++ ++ BUG_ON(!list_empty(&b->write_blocked)); ++ BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as)); ++ ++ b->will_make_reachable = 0; ++ closure_put(&as->cl); ++ ++ clear_btree_node_will_make_reachable(b); ++ clear_btree_node_accessed(b); ++ clear_btree_node_dirty_acct(c, b); ++ clear_btree_node_need_write(b); ++ ++ mutex_lock(&c->btree_cache.lock); ++ list_del_init(&b->list); ++ bch2_btree_node_hash_remove(&c->btree_cache, b); ++ mutex_unlock(&c->btree_cache.lock); ++ ++ BUG_ON(p->nr >= ARRAY_SIZE(p->b)); ++ p->b[p->nr++] = b; ++ ++ six_unlock_intent(&b->c.lock); ++ ++ trans_for_each_path(trans, path) ++ if (path->l[level].b == b) { ++ btree_node_unlock(trans, path, level); ++ path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init); ++ } ++} ++ +static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, + struct disk_reservation *res, + struct closure *cl, @@ -27538,6 +28334,7 @@ index 000000000000..578ba747826e + struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; + unsigned nr_reserve; + enum alloc_reserve alloc_reserve; ++ int ret; + + if (flags & BTREE_INSERT_USE_RESERVE) { + nr_reserve = 0; @@ -27560,7 +28357,7 @@ index 000000000000..578ba747826e + mutex_unlock(&c->btree_reserve_cache_lock); + +retry: -+ wp = bch2_alloc_sectors_start_trans(trans, ++ ret = bch2_alloc_sectors_start_trans(trans, + c->opts.metadata_target ?: + c->opts.foreground_target, + 0, @@ -27568,9 +28365,9 @@ index 000000000000..578ba747826e + &devs_have, + res->nr_replicas, + c->opts.metadata_replicas_required, -+ alloc_reserve, 0, cl); -+ if (IS_ERR(wp)) -+ return ERR_CAST(wp); ++ alloc_reserve, 0, cl, &wp); ++ if (unlikely(ret)) ++ 
return ERR_PTR(ret); + + if (wp->sectors_free < btree_sectors(c)) { + struct open_bucket *ob; @@ -27722,9 +28519,6 @@ index 000000000000..578ba747826e + btree_node_set_format(b, b->data->format); + bch2_btree_build_aux_trees(b); + -+ bch2_btree_update_add_new_node(as, b); -+ six_unlock_write(&b->c.lock); -+ + return b; +} + @@ -27960,7 +28754,7 @@ index 000000000000..578ba747826e + bch2_trans_unlock(&trans); + + bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c, -+ "error %i in btree_update_nodes_written()", ret); ++ "%s(): error %s", __func__, bch2_err_str(ret)); +err: + if (as->b) { + struct btree_path *path; @@ -28188,6 +28982,14 @@ index 000000000000..578ba747826e + mutex_unlock(&c->btree_interior_update_lock); + + btree_update_add_key(as, &as->new_keys, b); ++ ++ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { ++ unsigned bytes = vstruct_end(&b->data->keys) - (void *) b->data; ++ unsigned sectors = round_up(bytes, block_bytes(c)) >> 9; ++ ++ bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written = ++ cpu_to_le16(sectors); ++ } +} + +/* @@ -28355,24 +29157,24 @@ index 000000000000..578ba747826e + nr_nodes[!!update_level] += 1 + split; + update_level++; + -+ if (!btree_path_node(path, update_level)) ++ ret = bch2_btree_path_upgrade(trans, path, update_level + 1); ++ if (ret) ++ return ERR_PTR(ret); ++ ++ if (!btree_path_node(path, update_level)) { ++ /* Allocating new root? 
*/ ++ nr_nodes[1] += split; ++ update_level = BTREE_MAX_DEPTH; ++ break; ++ } ++ ++ if (bch2_btree_node_insert_fits(c, path->l[update_level].b, ++ BKEY_BTREE_PTR_U64s_MAX * (1 + split))) + break; + -+ /* -+ * XXX: figure out how far we might need to split, -+ * instead of locking/reserving all the way to the root: -+ */ -+ split = update_level + 1 < BTREE_MAX_DEPTH; ++ split = true; + } + -+ /* Might have to allocate a new root: */ -+ if (update_level < BTREE_MAX_DEPTH) -+ nr_nodes[1] += 1; -+ -+ ret = bch2_btree_path_upgrade(trans, path, U8_MAX); -+ if (ret) -+ return ERR_PTR(ret); -+ + if (flags & BTREE_INSERT_GC_LOCK_HELD) + lockdep_assert_held(&c->gc_lock); + else if (!down_read_trylock(&c->gc_lock)) { @@ -28393,6 +29195,7 @@ index 000000000000..578ba747826e + as->mode = BTREE_INTERIOR_NO_UPDATE; + as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD); + as->btree_id = path->btree_id; ++ as->update_level = update_level; + INIT_LIST_HEAD(&as->list); + INIT_LIST_HEAD(&as->unwritten_list); + INIT_LIST_HEAD(&as->write_blocked_list); @@ -28464,7 +29267,8 @@ index 000000000000..578ba747826e + } + + if (ret) { -+ trace_and_count(c, btree_reserve_get_fail, trans->fn, _RET_IP_, nr_nodes[0] + nr_nodes[1]); ++ trace_and_count(c, btree_reserve_get_fail, trans->fn, ++ _RET_IP_, nr_nodes[0] + nr_nodes[1], ret); + goto err; + } + @@ -28520,7 +29324,6 @@ index 000000000000..578ba747826e + struct btree *old; + + trace_and_count(c, btree_node_set_root, c, b); -+ BUG_ON(!b->written); + + old = btree_node_root(c, b); + @@ -28594,6 +29397,7 @@ index 000000000000..578ba747826e + bch2_btree_bset_insert_key(trans, path, b, node_iter, insert); + set_btree_node_dirty_acct(c, b); + set_btree_node_need_write(b); ++ b->write_type = BTREE_WRITE_interior; + + printbuf_exit(&buf); +} @@ -28644,8 +29448,6 @@ index 000000000000..578ba747826e + SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data)); + n2->key.k.p = n1->key.k.p; + -+ bch2_btree_update_add_new_node(as, n2); -+ + set1 = 
btree_bset_first(n1); + set2 = btree_bset_first(n2); + @@ -28787,18 +29589,19 @@ index 000000000000..578ba747826e + btree_node_interior_verify(as->c, b); +} + -+static void btree_split(struct btree_update *as, struct btree_trans *trans, -+ struct btree_path *path, struct btree *b, -+ struct keylist *keys, unsigned flags) ++static int btree_split(struct btree_update *as, struct btree_trans *trans, ++ struct btree_path *path, struct btree *b, ++ struct keylist *keys, unsigned flags) +{ + struct bch_fs *c = as->c; + struct btree *parent = btree_node_parent(path, b); + struct btree *n1, *n2 = NULL, *n3 = NULL; + struct btree_path *path1 = NULL, *path2 = NULL; + u64 start_time = local_clock(); ++ int ret = 0; + + BUG_ON(!parent && (b != btree_node_root(c, b))); -+ BUG_ON(!btree_node_intent_locked(path, btree_node_root(c, b)->c.level)); ++ BUG_ON(parent && !btree_node_intent_locked(path, b->c.level + 1)); + + bch2_btree_interior_update_will_free_node(as, b); + @@ -28814,6 +29617,9 @@ index 000000000000..578ba747826e + + bch2_btree_build_aux_trees(n2); + bch2_btree_build_aux_trees(n1); ++ ++ bch2_btree_update_add_new_node(as, n1); ++ bch2_btree_update_add_new_node(as, n2); + six_unlock_write(&n2->c.lock); + six_unlock_write(&n1->c.lock); + @@ -28827,11 +29633,6 @@ index 000000000000..578ba747826e + mark_btree_node_locked(trans, path2, n2->c.level, SIX_LOCK_intent); + bch2_btree_path_level_init(trans, path2, n2); + -+ bch2_btree_update_add_new_node(as, n1); -+ -+ bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0); -+ bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0); -+ + /* + * Note that on recursive parent_keys == keys, so we + * can't start adding new keys to parent_keys before emptying it @@ -28844,6 +29645,9 @@ index 000000000000..578ba747826e + /* Depth increases, make a new root */ + n3 = __btree_root_alloc(as, trans, b->c.level + 1); + ++ bch2_btree_update_add_new_node(as, n3); ++ six_unlock_write(&n3->c.lock); ++ + path2->locks_want++; + 
BUG_ON(btree_node_locked(path2, n3->c.level)); + six_lock_increment(&n3->c.lock, SIX_LOCK_intent); @@ -28854,13 +29658,12 @@ index 000000000000..578ba747826e + n3->sib_u64s[1] = U16_MAX; + + btree_split_insert_keys(as, trans, path, n3, &as->parent_keys); -+ -+ bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0); + } + } else { + trace_and_count(c, btree_node_compact, c, b); + + bch2_btree_build_aux_trees(n1); ++ bch2_btree_update_add_new_node(as, n1); + six_unlock_write(&n1->c.lock); + + path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p); @@ -28868,10 +29671,6 @@ index 000000000000..578ba747826e + mark_btree_node_locked(trans, path1, n1->c.level, SIX_LOCK_intent); + bch2_btree_path_level_init(trans, path1, n1); + -+ bch2_btree_update_add_new_node(as, n1); -+ -+ bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0); -+ + if (parent) + bch2_keylist_add(&as->parent_keys, &n1->key); + } @@ -28880,7 +29679,9 @@ index 000000000000..578ba747826e + + if (parent) { + /* Split a non root node */ -+ bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); ++ ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); ++ if (ret) ++ goto err; + } else if (n3) { + bch2_btree_set_root(as, trans, path, n3); + } else { @@ -28888,11 +29689,16 @@ index 000000000000..578ba747826e + bch2_btree_set_root(as, trans, path, n1); + } + -+ bch2_btree_update_get_open_buckets(as, n1); -+ if (n2) -+ bch2_btree_update_get_open_buckets(as, n2); -+ if (n3) ++ if (n3) { + bch2_btree_update_get_open_buckets(as, n3); ++ bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0); ++ } ++ if (n2) { ++ bch2_btree_update_get_open_buckets(as, n2); ++ bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0); ++ } ++ bch2_btree_update_get_open_buckets(as, n1); ++ bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0); + + /* + * The old node must be freed (in memory) _before_ unlocking the new @@ -28913,7 +29719,7 @@ index 000000000000..578ba747826e + if (n2) + 
six_unlock_intent(&n2->c.lock); + six_unlock_intent(&n1->c.lock); -+ ++out: + if (path2) { + __bch2_btree_path_unlock(trans, path2); + bch2_path_put(trans, path2, true); @@ -28929,6 +29735,14 @@ index 000000000000..578ba747826e + ? BCH_TIME_btree_node_split + : BCH_TIME_btree_node_compact], + start_time); ++ return ret; ++err: ++ if (n3) ++ bch2_btree_node_free_never_used(as, trans, n3); ++ if (n2) ++ bch2_btree_node_free_never_used(as, trans, n2); ++ bch2_btree_node_free_never_used(as, trans, n1); ++ goto out; +} + +static void @@ -28963,22 +29777,30 @@ index 000000000000..578ba747826e + * If a split occurred, this function will return early. This can only happen + * for leaf nodes -- inserts into interior nodes have to be atomic. + */ -+static void bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans, -+ struct btree_path *path, struct btree *b, -+ struct keylist *keys, unsigned flags) ++static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans, ++ struct btree_path *path, struct btree *b, ++ struct keylist *keys, unsigned flags) +{ + struct bch_fs *c = as->c; + int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); + int old_live_u64s = b->nr.live_u64s; + int live_u64s_added, u64s_added; ++ int ret; + + lockdep_assert_held(&c->gc_lock); -+ BUG_ON(!btree_node_intent_locked(path, btree_node_root(c, b)->c.level)); ++ BUG_ON(!btree_node_intent_locked(path, b->c.level)); + BUG_ON(!b->c.level); + BUG_ON(!as || as->b); + bch2_verify_keylist_sorted(keys); + -+ bch2_btree_node_lock_for_insert(trans, path, b); ++ if (!(local_clock() & 63)) ++ return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race); ++ ++ ret = bch2_btree_node_lock_write(trans, path, &b->c); ++ if (ret) ++ return ret; ++ ++ bch2_btree_node_prep_for_write(trans, path, b); + + if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) { + bch2_btree_node_unlock_write(trans, path, b); @@ -29004,9 +29826,16 @@ index 
000000000000..578ba747826e + bch2_btree_node_unlock_write(trans, path, b); + + btree_node_interior_verify(c, b); -+ return; ++ return 0; +split: -+ btree_split(as, trans, path, b, keys, flags); ++ /* ++ * We could attempt to avoid the transaction restart, by calling ++ * bch2_btree_path_upgrade() and allocating more nodes: ++ */ ++ if (b->c.level >= as->update_level) ++ return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race); ++ ++ return btree_split(as, trans, path, b, keys, flags); +} + +int bch2_btree_split_leaf(struct btree_trans *trans, @@ -29023,10 +29852,15 @@ index 000000000000..578ba747826e + if (IS_ERR(as)) + return PTR_ERR(as); + -+ btree_split(as, trans, path, b, NULL, flags); ++ ret = btree_split(as, trans, path, b, NULL, flags); ++ if (ret) { ++ bch2_btree_update_free(as, trans); ++ return ret; ++ } ++ + bch2_btree_update_done(as, trans); + -+ for (l = path->level + 1; btree_path_node(path, l) && !ret; l++) ++ for (l = path->level + 1; btree_node_intent_locked(path, l) && !ret; l++) + ret = bch2_foreground_maybe_merge(trans, path, l, flags); + + return ret; @@ -29095,10 +29929,10 @@ index 000000000000..578ba747826e + bch2_bpos_to_text(&buf1, prev->data->max_key); + bch2_bpos_to_text(&buf2, next->data->min_key); + bch_err(c, -+ "btree topology error in btree merge:\n" ++ "%s(): btree topology error:\n" + " prev ends at %s\n" + " next starts at %s", -+ buf1.buf, buf2.buf); ++ __func__, buf1.buf, buf2.buf); + printbuf_exit(&buf1); + printbuf_exit(&buf2); + bch2_topology_error(c); @@ -29152,8 +29986,6 @@ index 000000000000..578ba747826e + btree_set_min(n, prev->data->min_key); + btree_set_max(n, next->data->max_key); + -+ bch2_btree_update_add_new_node(as, n); -+ + n->data->format = new_f; + btree_node_set_format(n, new_f); + @@ -29161,6 +29993,7 @@ index 000000000000..578ba747826e + bch2_btree_sort_into(c, n, next); + + bch2_btree_build_aux_trees(n); ++ bch2_btree_update_add_new_node(as, n); + six_unlock_write(&n->c.lock); + + new_path 
= get_unlocked_mut_path(trans, path->btree_id, n->c.level, n->key.k.p); @@ -29168,8 +30001,6 @@ index 000000000000..578ba747826e + mark_btree_node_locked(trans, new_path, n->c.level, SIX_LOCK_intent); + bch2_btree_path_level_init(trans, new_path, n); + -+ bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); -+ + bkey_init(&delete.k); + delete.k.p = prev->key.k.p; + bch2_keylist_add(&as->parent_keys, &delete); @@ -29177,11 +30008,14 @@ index 000000000000..578ba747826e + + bch2_trans_verify_paths(trans); + -+ bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); ++ ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); ++ if (ret) ++ goto err_free_update; + + bch2_trans_verify_paths(trans); + + bch2_btree_update_get_open_buckets(as, n); ++ bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); + + bch2_btree_node_free_inmem(trans, path, b); + bch2_btree_node_free_inmem(trans, sib_path, m); @@ -29202,6 +30036,10 @@ index 000000000000..578ba747826e + bch2_path_put(trans, sib_path, true); + bch2_trans_verify_locks(trans); + return ret; ++err_free_update: ++ bch2_btree_node_free_never_used(as, trans, n); ++ bch2_btree_update_free(as, trans); ++ goto out; +} + +/** @@ -29230,9 +30068,9 @@ index 000000000000..578ba747826e + bch2_btree_interior_update_will_free_node(as, b); + + n = bch2_btree_node_alloc_replacement(as, trans, b); -+ bch2_btree_update_add_new_node(as, n); + + bch2_btree_build_aux_trees(n); ++ bch2_btree_update_add_new_node(as, n); + six_unlock_write(&n->c.lock); + + new_path = get_unlocked_mut_path(trans, iter->btree_id, n->c.level, n->key.k.p); @@ -29242,17 +30080,18 @@ index 000000000000..578ba747826e + + trace_and_count(c, btree_node_rewrite, c, b); + -+ bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); -+ + if (parent) { + bch2_keylist_add(&as->parent_keys, &n->key); -+ bch2_btree_insert_node(as, trans, iter->path, parent, -+ &as->parent_keys, flags); ++ ret = bch2_btree_insert_node(as, trans, iter->path, parent, ++ 
&as->parent_keys, flags); ++ if (ret) ++ goto err; + } else { + bch2_btree_set_root(as, trans, iter->path, n); + } + + bch2_btree_update_get_open_buckets(as, n); ++ bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); + + bch2_btree_node_free_inmem(trans, iter->path, b); + @@ -29260,10 +30099,15 @@ index 000000000000..578ba747826e + six_unlock_intent(&n->c.lock); + + bch2_btree_update_done(as, trans); -+ bch2_path_put(trans, new_path, true); +out: ++ if (new_path) ++ bch2_path_put(trans, new_path, true); + bch2_btree_path_downgrade(trans, iter->path); + return ret; ++err: ++ bch2_btree_node_free_never_used(as, trans, n); ++ bch2_btree_update_free(as, trans); ++ goto out; +} + +struct async_btree_rewrite { @@ -29293,7 +30137,7 @@ index 000000000000..578ba747826e + goto out; + + ret = bch2_btree_node_rewrite(trans, &iter, b, 0); -+out : ++out: + bch2_trans_iter_exit(trans, &iter); + + return ret; @@ -29681,10 +30525,10 @@ index 000000000000..578ba747826e +} diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h new file mode 100644 -index 000000000000..7af810df8348 +index 000000000000..2e6d220c3bcd --- /dev/null +++ b/fs/bcachefs/btree_update_interior.h -@@ -0,0 +1,322 @@ +@@ -0,0 +1,324 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_UPDATE_INTERIOR_H +#define _BCACHEFS_BTREE_UPDATE_INTERIOR_H @@ -29739,6 +30583,7 @@ index 000000000000..7af810df8348 + unsigned took_gc_lock:1; + + enum btree_id btree_id; ++ unsigned update_level; + + struct disk_reservation disk_res; + struct journal_preres journal_preres; @@ -29968,6 +30813,7 @@ index 000000000000..7af810df8348 + struct bkey_packed k; + + BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s); ++ EBUG_ON(btree_node_just_written(b)); + + if (!bkey_pack_pos(&k, pos, b)) { + struct bkey *u = (void *) &k; @@ -30009,10 +30855,10 @@ index 000000000000..7af810df8348 +#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */ diff --git a/fs/bcachefs/btree_update_leaf.c 
b/fs/bcachefs/btree_update_leaf.c new file mode 100644 -index 000000000000..08d7001f7217 +index 000000000000..b930b788410d --- /dev/null +++ b/fs/bcachefs/btree_update_leaf.c -@@ -0,0 +1,1745 @@ +@@ -0,0 +1,1760 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -30071,9 +30917,9 @@ index 000000000000..08d7001f7217 + insert_l(&i[0])->b == insert_l(&i[1])->b; +} + -+static inline void bch2_btree_node_prep_for_write(struct btree_trans *trans, -+ struct btree_path *path, -+ struct btree *b) ++inline void bch2_btree_node_prep_for_write(struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree *b) +{ + struct bch_fs *c = trans->c; + @@ -30092,14 +30938,6 @@ index 000000000000..08d7001f7217 + bch2_btree_init_next(trans, b); +} + -+void bch2_btree_node_lock_for_insert(struct btree_trans *trans, -+ struct btree_path *path, -+ struct btree *b) -+{ -+ bch2_btree_node_lock_write_nofail(trans, path, &b->c); -+ bch2_btree_node_prep_for_write(trans, path, b); -+} -+ +/* Inserting into a given leaf node (last stage of insert): */ + +/* Handle overwrites and do insert, for non extents: */ @@ -30204,6 +31042,8 @@ index 000000000000..08d7001f7217 + new |= 1 << BTREE_NODE_need_write; + } while ((v = cmpxchg(&b->flags, old, new)) != old); + ++ b->write_type = BTREE_WRITE_journal_reclaim; ++ + btree_node_write_if_need(c, b, SIX_LOCK_read); + six_unlock_read(&b->c.lock); + @@ -30312,7 +31152,7 @@ index 000000000000..08d7001f7217 + return 0; +} + -+static inline int bch2_trans_journal_res_get(struct btree_trans *trans, ++static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans, + unsigned flags) +{ + struct bch_fs *c = trans->c; @@ -30362,7 +31202,7 @@ index 000000000000..08d7001f7217 +{ + struct bch_fs *c = trans->c; + struct bkey_cached *ck = (void *) path->l[0].b; -+ unsigned old_u64s = ck->u64s, new_u64s; ++ unsigned new_u64s; + struct bkey_i *new_k; + + EBUG_ON(path->level); @@ -30391,12 +31231,7 @@ index 
000000000000..08d7001f7217 + + ck->u64s = new_u64s; + ck->k = new_k; -+ /* -+ * Keys returned by peek() are no longer valid pointers, so we need a -+ * transaction restart: -+ */ -+ trace_and_count(c, trans_restart_key_cache_key_realloced, trans, _RET_IP_, path, old_u64s, new_u64s); -+ return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_key_cache_realloced); ++ return 0; +} + +/* Triggers: */ @@ -30749,33 +31584,34 @@ index 000000000000..08d7001f7217 + return ret; +} + ++static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i) ++{ ++ while (--i >= trans->updates) { ++ if (same_leaf_as_prev(trans, i)) ++ continue; ++ ++ bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b); ++ } ++ ++ trace_and_count(trans->c, trans_restart_would_deadlock_write, trans); ++ return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write); ++} ++ +static inline int trans_lock_write(struct btree_trans *trans) +{ + struct btree_insert_entry *i; -+ int ret; + + trans_for_each_update(trans, i) { + if (same_leaf_as_prev(trans, i)) + continue; + -+ ret = bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c); -+ if (ret) -+ goto fail; ++ if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c)) ++ return trans_lock_write_fail(trans, i); + + bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); + } + + return 0; -+fail: -+ while (--i >= trans->updates) { -+ if (same_leaf_as_prev(trans, i)) -+ continue; -+ -+ bch2_btree_node_unlock_write_inlined(trans, i->path, insert_l(i)->b); -+ } -+ -+ trace_and_count(trans->c, trans_restart_would_deadlock_write, trans); -+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write); +} + +static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) @@ -30786,6 +31622,33 @@ index 000000000000..08d7001f7217 + bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); +} + ++static 
noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, ++ struct btree_insert_entry *i, ++ struct printbuf *err) ++{ ++ struct bch_fs *c = trans->c; ++ int rw = (trans->flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE; ++ ++ printbuf_reset(err); ++ prt_printf(err, "invalid bkey on insert from %s -> %ps", ++ trans->fn, (void *) i->ip_allocated); ++ prt_newline(err); ++ printbuf_indent_add(err, 2); ++ ++ bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k)); ++ prt_newline(err); ++ ++ bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), ++ i->bkey_type, rw, err); ++ bch2_print_string_as_lines(KERN_ERR, err->buf); ++ ++ bch2_inconsistent_error(c); ++ bch2_dump_trans_updates(trans); ++ printbuf_exit(err); ++ ++ return -EINVAL; ++} ++ +/* + * Get journal reservation, take write locks, and attempt to do btree update(s): + */ @@ -30800,24 +31663,9 @@ index 000000000000..08d7001f7217 + int rw = (trans->flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE; + + trans_for_each_update(trans, i) { -+ if (bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), -+ i->bkey_type, rw, &buf)) { -+ printbuf_reset(&buf); -+ prt_printf(&buf, "invalid bkey on insert from %s -> %ps", -+ trans->fn, (void *) i->ip_allocated); -+ prt_newline(&buf); -+ printbuf_indent_add(&buf, 2); -+ -+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); -+ prt_newline(&buf); -+ -+ bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), -+ i->bkey_type, rw, &buf); -+ -+ bch2_trans_inconsistent(trans, "%s", buf.buf); -+ printbuf_exit(&buf); -+ return -EINVAL; -+ } ++ if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), ++ i->bkey_type, rw, &buf))) ++ return bch2_trans_commit_bkey_invalid(trans, i, &buf); + btree_insert_entry_checks(trans, i); + } + @@ -31411,11 +32259,41 @@ index 000000000000..08d7001f7217 +static int __must_check +bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *path, + struct bkey_i *k, enum btree_update_flags flags, ++ unsigned long ip); ++ ++static noinline int 
flush_new_cached_update(struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree_insert_entry *i, ++ enum btree_update_flags flags, ++ unsigned long ip) ++{ ++ struct btree_path *btree_path; ++ int ret; ++ ++ i->key_cache_already_flushed = true; ++ i->flags |= BTREE_TRIGGER_NORUN; ++ ++ btree_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0, ++ BTREE_ITER_INTENT, _THIS_IP_); ++ ++ ret = bch2_btree_path_traverse(trans, btree_path, 0); ++ if (ret) ++ goto err; ++ ++ btree_path_set_should_be_locked(btree_path); ++ ret = bch2_trans_update_by_path_trace(trans, btree_path, i->k, flags, ip); ++err: ++ bch2_path_put(trans, btree_path, true); ++ return ret; ++} ++ ++static int __must_check ++bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *path, ++ struct bkey_i *k, enum btree_update_flags flags, + unsigned long ip) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i, n; -+ int ret = 0; + + BUG_ON(!path->should_be_locked); + @@ -31484,27 +32362,10 @@ index 000000000000..08d7001f7217 + * the key cache - but the key has to exist in the btree for that to + * work: + */ -+ if (path->cached && -+ bkey_deleted(&i->old_k)) { -+ struct btree_path *btree_path; ++ if (unlikely(path->cached && bkey_deleted(&i->old_k))) ++ return flush_new_cached_update(trans, path, i, flags, ip); + -+ i->key_cache_already_flushed = true; -+ i->flags |= BTREE_TRIGGER_NORUN; -+ -+ btree_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0, -+ BTREE_ITER_INTENT, _THIS_IP_); -+ -+ ret = bch2_btree_path_traverse(trans, btree_path, 0); -+ if (ret) -+ goto err; -+ -+ btree_path_set_should_be_locked(btree_path); -+ ret = bch2_trans_update_by_path_trace(trans, btree_path, k, flags, ip); -+err: -+ bch2_path_put(trans, btree_path, true); -+ } -+ -+ return ret; ++ return 0; +} + +static int __must_check @@ -31760,10 +32621,10 @@ index 000000000000..08d7001f7217 +} diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c new file mode 
100644 -index 000000000000..8af0dd022fda +index 000000000000..bf01837e1362 --- /dev/null +++ b/fs/bcachefs/buckets.c -@@ -0,0 +1,2113 @@ +@@ -0,0 +1,2117 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Code for manipulating bucket marks for garbage collection. @@ -31855,20 +32716,17 @@ index 000000000000..8af0dd022fda + : ca->usage[journal_seq & JOURNAL_BUF_MASK]); +} + -+struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) ++void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage) +{ + struct bch_fs *c = ca->fs; -+ struct bch_dev_usage ret; + unsigned seq, i, u64s = dev_usage_u64s(); + + do { + seq = read_seqcount_begin(&c->usage_lock); -+ memcpy(&ret, ca->usage_base, u64s * sizeof(u64)); ++ memcpy(usage, ca->usage_base, u64s * sizeof(u64)); + for (i = 0; i < ARRAY_SIZE(ca->usage); i++) -+ acc_u64s_percpu((u64 *) &ret, (u64 __percpu *) ca->usage[i], u64s); ++ acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage[i], u64s); + } while (read_seqcount_retry(&c->usage_lock, seq)); -+ -+ return ret; +} + +static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, @@ -32341,10 +33199,11 @@ index 000000000000..8af0dd022fda + if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && + old_a.cached_sectors) { + ret = update_cached_sectors(c, new, ca->dev_idx, -+ -old_a.cached_sectors, ++ -((s64) old_a.cached_sectors), + journal_seq, gc); + if (ret) { -+ bch2_fs_fatal_error(c, "bch2_mark_alloc(): no replicas entry while updating cached sectors"); ++ bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors", ++ __func__); + return ret; + } + } @@ -32440,6 +33299,10 @@ index 000000000000..8af0dd022fda + if (bucket_data_type == BCH_DATA_cached) + bucket_data_type = BCH_DATA_user; + ++ if ((bucket_data_type == BCH_DATA_stripe && ptr_data_type == BCH_DATA_user) || ++ (bucket_data_type == BCH_DATA_user && ptr_data_type == BCH_DATA_stripe)) ++ bucket_data_type = ptr_data_type = BCH_DATA_stripe; ++ + if (gen_after(ptr->gen, 
b_gen)) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" @@ -32685,7 +33548,7 @@ index 000000000000..8af0dd022fda +{ + u64 journal_seq = trans->journal_res.seq; + struct bch_fs *c = trans->c; -+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; ++ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; @@ -32724,7 +33587,8 @@ index 000000000000..8af0dd022fda + ret = update_cached_sectors(c, k, p.ptr.dev, + disk_sectors, journal_seq, true); + if (ret) { -+ bch2_fs_fatal_error(c, "bch2_mark_extent(): no replicas entry while updating cached sectors"); ++ bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors", ++ __func__); + return ret; + } + } @@ -32752,7 +33616,7 @@ index 000000000000..8af0dd022fda + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, k); -+ bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf); ++ bch2_fs_fatal_error(c, "%s(): no replicas entry for %s", __func__, buf.buf); + printbuf_exit(&buf); + return ret; + } @@ -32877,10 +33741,10 @@ index 000000000000..8af0dd022fda + u64 journal_seq = trans->journal_res.seq; + + if (flags & BTREE_TRIGGER_INSERT) { -+ struct bch_inode_v2 *v = (struct bch_inode_v2 *) new.v; ++ struct bch_inode_v3 *v = (struct bch_inode_v3 *) new.v; + + BUG_ON(!journal_seq); -+ BUG_ON(new.k->type != KEY_TYPE_inode_v2); ++ BUG_ON(new.k->type != KEY_TYPE_inode_v3); + + v->bi_journal_seq = cpu_to_le64(journal_seq); + } @@ -32904,7 +33768,7 @@ index 000000000000..8af0dd022fda + unsigned flags) +{ + struct bch_fs *c = trans->c; -+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; ++ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? 
old : new; + struct bch_fs_usage __percpu *fs_usage; + unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; + s64 sectors = (s64) k.k->size; @@ -32983,7 +33847,7 @@ index 000000000000..8af0dd022fda + unsigned flags) +{ + struct bch_fs *c = trans->c; -+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; ++ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new; + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + struct reflink_gc *ref; + size_t l, r, m; @@ -33026,23 +33890,24 @@ index 000000000000..8af0dd022fda + struct btree_insert_entry *i; + struct printbuf buf = PRINTBUF; + -+ bch_err(c, "disk usage increased %lli more than %u sectors reserved", -+ should_not_have_added, disk_res_sectors); ++ prt_printf(&buf, ++ bch2_fmt(c, "disk usage increased %lli more than %u sectors reserved)"), ++ should_not_have_added, disk_res_sectors); + + trans_for_each_update(trans, i) { + struct bkey_s_c old = { &i->old_k, i->old_v }; + -+ pr_err("while inserting"); -+ printbuf_reset(&buf); ++ prt_str(&buf, "new "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); -+ pr_err(" %s", buf.buf); -+ pr_err("overlapping with"); -+ printbuf_reset(&buf); ++ prt_newline(&buf); ++ ++ prt_str(&buf, "old "); + bch2_bkey_val_to_text(&buf, c, old); -+ pr_err(" %s", buf.buf); ++ prt_newline(&buf); + } + + __WARN(); ++ bch2_print_string_as_lines(KERN_ERR, buf.buf); + printbuf_exit(&buf); +} + @@ -33712,7 +34577,7 @@ index 000000000000..8af0dd022fda + +#define SECTORS_CACHE 1024 + -+int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, ++int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, + u64 sectors, int flags) +{ + struct bch_fs_pcpu *pcpu; @@ -33875,14 +34740,14 @@ index 000000000000..8af0dd022fda + return -ENOMEM; + } + -+ return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);; ++ return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets); +} diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h 
new file mode 100644 -index 000000000000..6881502d95f1 +index 000000000000..01c706b73cee --- /dev/null +++ b/fs/bcachefs/buckets.h -@@ -0,0 +1,300 @@ +@@ -0,0 +1,326 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Code for manipulating bucket marks for garbage collection. @@ -34024,7 +34889,15 @@ index 000000000000..6881502d95f1 + +/* Device usage: */ + -+struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *); ++void bch2_dev_usage_read_fast(struct bch_dev *, struct bch_dev_usage *); ++static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) ++{ ++ struct bch_dev_usage ret; ++ ++ bch2_dev_usage_read_fast(ca, &ret); ++ return ret; ++} ++ +void bch2_dev_usage_init(struct bch_dev *); + +static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum alloc_reserve reserve) @@ -34125,8 +34998,6 @@ index 000000000000..6881502d95f1 +int bch2_trans_mark_reservation(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_trans_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); + -+int bch2_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); -+ +int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); + +int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, @@ -34138,15 +35009,35 @@ index 000000000000..6881502d95f1 +static inline void bch2_disk_reservation_put(struct bch_fs *c, + struct disk_reservation *res) +{ -+ this_cpu_sub(*c->online_reserved, res->sectors); -+ res->sectors = 0; ++ if (res->sectors) { ++ this_cpu_sub(*c->online_reserved, res->sectors); ++ res->sectors = 0; ++ } +} + +#define BCH_DISK_RESERVATION_NOFAIL (1 << 0) + -+int bch2_disk_reservation_add(struct bch_fs *, -+ struct disk_reservation *, -+ u64, int); ++int __bch2_disk_reservation_add(struct bch_fs *, ++ struct disk_reservation *, ++ u64, int); ++ ++static inline int bch2_disk_reservation_add(struct 
bch_fs *c, struct disk_reservation *res, ++ u64 sectors, int flags) ++{ ++ u64 old, new; ++ ++ do { ++ old = this_cpu_read(c->pcpu->sectors_available); ++ if (sectors > old) ++ return __bch2_disk_reservation_add(c, res, sectors, flags); ++ ++ new = old - sectors; ++ } while (this_cpu_cmpxchg(c->pcpu->sectors_available, old, new) != old); ++ ++ this_cpu_add(*c->online_reserved, sectors); ++ res->sectors += sectors; ++ return 0; ++} + +static inline struct disk_reservation +bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas) @@ -35320,7 +36211,7 @@ index 000000000000..3a4890d39ff9 +#endif /* _BCACHEFS_CHARDEV_H */ diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c new file mode 100644 -index 000000000000..b5850a761b91 +index 000000000000..43d22fe8131b --- /dev/null +++ b/fs/bcachefs/checksum.c @@ -0,0 +1,712 @@ @@ -35457,7 +36348,7 @@ index 000000000000..b5850a761b91 + size_t orig_len = len; + int ret, i; + -+ sg = kmalloc_array(sizeof(*sg), pages, GFP_KERNEL); ++ sg = kmalloc_array(pages, sizeof(*sg), GFP_KERNEL); + if (!sg) + return -ENOMEM; + @@ -35642,7 +36533,7 @@ index 000000000000..b5850a761b91 + return __bch2_checksum_bio(c, type, nonce, bio, &iter); +} + -+int bch2_encrypt_bio(struct bch_fs *c, unsigned type, ++int __bch2_encrypt_bio(struct bch_fs *c, unsigned type, + struct nonce nonce, struct bio *bio) +{ + struct bio_vec bv; @@ -36038,10 +36929,10 @@ index 000000000000..b5850a761b91 +} diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h new file mode 100644 -index 000000000000..c86c3c05d620 +index 000000000000..f7ccef7a5520 --- /dev/null +++ b/fs/bcachefs/checksum.h -@@ -0,0 +1,204 @@ +@@ -0,0 +1,212 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_CHECKSUM_H +#define _BCACHEFS_CHECKSUM_H @@ -36105,8 +36996,16 @@ index 000000000000..c86c3c05d620 + struct bch_extent_crc_unpacked *, + unsigned, unsigned, unsigned); + -+int bch2_encrypt_bio(struct bch_fs *, unsigned, -+ struct nonce, struct bio *); ++int 
__bch2_encrypt_bio(struct bch_fs *, unsigned, ++ struct nonce, struct bio *); ++ ++static inline int bch2_encrypt_bio(struct bch_fs *c, unsigned type, ++ struct nonce nonce, struct bio *bio) ++{ ++ return bch2_csum_type_is_encryption(type) ++ ? __bch2_encrypt_bio(c, type, nonce, bio) ++ : 0; ++} + +int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *, + struct bch_key *); @@ -36122,15 +37021,15 @@ index 000000000000..c86c3c05d620 +{ + switch (type) { + case BCH_CSUM_OPT_none: -+ return BCH_CSUM_none; ++ return BCH_CSUM_none; + case BCH_CSUM_OPT_crc32c: -+ return data ? BCH_CSUM_crc32c : BCH_CSUM_crc32c_nonzero; ++ return data ? BCH_CSUM_crc32c : BCH_CSUM_crc32c_nonzero; + case BCH_CSUM_OPT_crc64: -+ return data ? BCH_CSUM_crc64 : BCH_CSUM_crc64_nonzero; ++ return data ? BCH_CSUM_crc64 : BCH_CSUM_crc64_nonzero; + case BCH_CSUM_OPT_xxhash: -+ return BCH_CSUM_xxhash; ++ return BCH_CSUM_xxhash; + default: -+ BUG(); ++ BUG(); + } +} + @@ -36532,7 +37431,7 @@ index 000000000000..5fae0012d808 +#endif /* _BCACHEFS_CLOCK_TYPES_H */ diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c new file mode 100644 -index 000000000000..f692f35a6a98 +index 000000000000..2b7080b67eca --- /dev/null +++ b/fs/bcachefs/compress.c @@ -0,0 +1,639 @@ @@ -36915,7 +37814,7 @@ index 000000000000..f692f35a6a98 + + /* If it's only one block, don't bother trying to compress: */ + if (src->bi_iter.bi_size <= c->opts.block_size) -+ return 0; ++ return BCH_COMPRESSION_TYPE_incompressible; + + dst_data = bio_map_or_bounce(c, dst, WRITE); + src_data = bio_map_or_bounce(c, src, READ); @@ -37420,10 +38319,10 @@ index 000000000000..519ab9b96e67 +#endif /* _BCACHEFS_DARRAY_H */ diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c new file mode 100644 -index 000000000000..cb25efb68d3f +index 000000000000..d304c6cf77c6 --- /dev/null +++ b/fs/bcachefs/data_update.c -@@ -0,0 +1,373 @@ +@@ -0,0 +1,387 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ 
-37523,14 +38422,13 @@ index 000000000000..cb25efb68d3f + ptr->cached = true; +} + -+static int bch2_data_update_index_update(struct bch_write_op *op) ++int bch2_data_update_index_update(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct btree_trans trans; + struct btree_iter iter; + struct data_update *m = + container_of(op, struct data_update, op); -+ struct open_bucket *ec_ob = ec_open_bucket(c, &op->open_buckets); + struct keylist *keys = &op->insert_keys; + struct bkey_buf _new, _insert; + int ret = 0; @@ -37652,15 +38550,12 @@ index 000000000000..cb25efb68d3f + bch2_trans_update(&trans, &iter, insert, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + bch2_trans_commit(&trans, &op->res, -+ op_journal_seq(op), ++ &op->journal_seq, + BTREE_INSERT_NOFAIL| + m->data_opts.btree_insert_flags); + if (!ret) { + bch2_btree_iter_set_pos(&iter, next_pos); + -+ if (ec_ob) -+ bch2_ob_add_backpointer(c, ec_ob, &insert->k); -+ + this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size); + trace_move_extent_finish(&new->k); + } @@ -37700,8 +38595,7 @@ index 000000000000..cb25efb68d3f +} + +void bch2_data_update_read_done(struct data_update *m, -+ struct bch_extent_crc_unpacked crc, -+ struct closure *cl) ++ struct bch_extent_crc_unpacked crc) +{ + /* write bio must own pages: */ + BUG_ON(!m->op.wbio.bio.bi_vcnt); @@ -37709,7 +38603,7 @@ index 000000000000..cb25efb68d3f + m->op.crc = crc; + m->op.wbio.bio.bi_iter.bi_size = crc.compressed_size << 9; + -+ closure_call(&m->op.cl, bch2_write, NULL, cl); ++ closure_call(&m->op.cl, bch2_write, NULL, NULL); +} + +void bch2_data_update_exit(struct data_update *update) @@ -37742,24 +38636,25 @@ index 000000000000..cb25efb68d3f + bch2_write_op_init(&m->op, c, io_opts); + m->op.pos = bkey_start_pos(k.k); + m->op.version = k.k->version; -+ m->op.target = data_opts.target, ++ m->op.target = data_opts.target; + m->op.write_point = wp; + m->op.flags |= BCH_WRITE_PAGES_STABLE| + BCH_WRITE_PAGES_OWNED| + 
BCH_WRITE_DATA_ENCODED| + BCH_WRITE_FROM_INTERNAL| ++ BCH_WRITE_MOVE| + m->data_opts.write_flags; + m->op.compression_type = + bch2_compression_opt_to_type[io_opts.background_compression ?: + io_opts.compression]; + if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) + m->op.alloc_reserve = RESERVE_movinggc; -+ m->op.index_update_fn = bch2_data_update_index_update; + + i = 0; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -+ if (p.ptr.cached) -+ m->data_opts.rewrite_ptrs &= ~(1U << i); ++ if (((1U << i) & m->data_opts.rewrite_ptrs) && ++ p.ptr.cached) ++ BUG(); + + if (!((1U << i) & m->data_opts.rewrite_ptrs)) + bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); @@ -37795,14 +38690,32 @@ index 000000000000..cb25efb68d3f + + m->op.nr_replicas = m->op.nr_replicas_required = + hweight32(m->data_opts.rewrite_ptrs) + m->data_opts.extra_replicas; ++ ++ BUG_ON(!m->op.nr_replicas); + return 0; +} ++ ++void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ unsigned i = 0; ++ ++ bkey_for_each_ptr(ptrs, ptr) { ++ if ((opts->rewrite_ptrs & (1U << i)) && ptr->cached) { ++ opts->kill_ptrs |= 1U << i; ++ opts->rewrite_ptrs ^= 1U << i; ++ } ++ ++ i++; ++ } ++} diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h new file mode 100644 -index 000000000000..e64505453a55 +index 000000000000..5d8690795959 --- /dev/null +++ b/fs/bcachefs/data_update.h -@@ -0,0 +1,38 @@ +@@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _BCACHEFS_DATA_UPDATE_H @@ -37815,6 +38728,7 @@ index 000000000000..e64505453a55 + +struct data_update_opts { + unsigned rewrite_ptrs; ++ unsigned kill_ptrs; + u16 target; + u8 extra_replicas; + unsigned btree_insert_flags; @@ -37830,23 +38744,25 @@ index 000000000000..e64505453a55 + struct bch_write_op op; +}; + ++int bch2_data_update_index_update(struct bch_write_op *); ++ +void 
bch2_data_update_read_done(struct data_update *, -+ struct bch_extent_crc_unpacked, -+ struct closure *); ++ struct bch_extent_crc_unpacked); + +void bch2_data_update_exit(struct data_update *); +int bch2_data_update_init(struct bch_fs *, struct data_update *, + struct write_point_specifier, + struct bch_io_opts, struct data_update_opts, + enum btree_id, struct bkey_s_c); ++void bch2_data_update_opts_normalize(struct bkey_s_c, struct data_update_opts *); + +#endif /* _BCACHEFS_DATA_UPDATE_H */ diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c new file mode 100644 -index 000000000000..1d2a16155073 +index 000000000000..57602c8e6c34 --- /dev/null +++ b/fs/bcachefs/debug.c -@@ -0,0 +1,831 @@ +@@ -0,0 +1,811 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Assorted bcachefs debug code @@ -38326,7 +39242,7 @@ index 000000000000..1d2a16155073 + if (i->iter < tbl->size) { + rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash) + bch2_cached_btree_node_to_text(&i->buf, c, b); -+ i->iter++;; ++ i->iter++; + } else { + done = true; + } @@ -38350,26 +39266,6 @@ index 000000000000..1d2a16155073 + .read = bch2_cached_btree_nodes_read, +}; + -+static int prt_backtrace(struct printbuf *out, struct task_struct *task) -+{ -+ unsigned long entries[32]; -+ unsigned i, nr_entries; -+ int ret; -+ -+ ret = down_read_killable(&task->signal->exec_update_lock); -+ if (ret) -+ return ret; -+ -+ nr_entries = stack_trace_save_tsk(task, entries, ARRAY_SIZE(entries), 0); -+ for (i = 0; i < nr_entries; i++) { -+ prt_printf(out, "[<0>] %pB", (void *)entries[i]); -+ prt_newline(out); -+ } -+ -+ up_read(&task->signal->exec_update_lock); -+ return 0; -+} -+ +static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ @@ -38396,7 +39292,7 @@ index 000000000000..1d2a16155073 + prt_printf(&i->buf, "backtrace:"); + prt_newline(&i->buf); + printbuf_indent_add(&i->buf, 2); -+ prt_backtrace(&i->buf, trans->locking_wait.task); ++ 
bch2_prt_backtrace(&i->buf, trans->locking_wait.task); + printbuf_indent_sub(&i->buf, 2); + prt_newline(&i->buf); + @@ -38506,11 +39402,11 @@ index 000000000000..1d2a16155073 + if (!i->size) + break; + -+ if (i->iter == ARRAY_SIZE(c->btree_transaction_fns) || -+ !c->btree_transaction_fns[i->iter]) ++ if (i->iter == ARRAY_SIZE(bch2_btree_transaction_fns) || ++ !bch2_btree_transaction_fns[i->iter]) + break; + -+ prt_printf(&i->buf, "%s: ", c->btree_transaction_fns[i->iter]); ++ prt_printf(&i->buf, "%s: ", bch2_btree_transaction_fns[i->iter]); + prt_newline(&i->buf); + printbuf_indent_add(&i->buf, 2); + @@ -38716,7 +39612,7 @@ index 000000000000..0b86736e5e1b +#endif /* _BCACHEFS_DEBUG_H */ diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c new file mode 100644 -index 000000000000..4d942d224a08 +index 000000000000..288f46b55876 --- /dev/null +++ b/fs/bcachefs/dirent.c @@ -0,0 +1,565 @@ @@ -38825,7 +39721,7 @@ index 000000000000..4d942d224a08 + + if (bkey_val_u64s(k.k) > dirent_val_u64s(len)) { + prt_printf(err, "value too big (%zu > %u)", -+ bkey_val_u64s(k.k),dirent_val_u64s(len)); ++ bkey_val_u64s(k.k), dirent_val_u64s(len)); + return -EINVAL; + } + @@ -39287,7 +40183,7 @@ index 000000000000..4d942d224a08 +} diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h new file mode 100644 -index 000000000000..b1466932c768 +index 000000000000..1a2c9108f864 --- /dev/null +++ b/fs/bcachefs/dirent.h @@ -0,0 +1,67 @@ @@ -39302,10 +40198,10 @@ index 000000000000..b1466932c768 +int bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + -+#define bch2_bkey_ops_dirent (struct bkey_ops) { \ ++#define bch2_bkey_ops_dirent ((struct bkey_ops) { \ + .key_invalid = bch2_dirent_invalid, \ + .val_to_text = bch2_dirent_to_text, \ -+} ++}) + +struct qstr; +struct file; @@ -39968,16 +40864,17 @@ index 000000000000..e4470c357a66 +#endif /* _BCACHEFS_DISK_GROUPS_H */ diff 
--git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c new file mode 100644 -index 000000000000..aa8301146382 +index 000000000000..dfe37965d516 --- /dev/null +++ b/fs/bcachefs/ec.c -@@ -0,0 +1,1673 @@ +@@ -0,0 +1,1680 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* erasure coding */ + +#include "bcachefs.h" +#include "alloc_foreground.h" ++#include "backpointers.h" +#include "bkey_buf.h" +#include "bset.h" +#include "btree_gc.h" @@ -40794,17 +41691,13 @@ index 000000000000..aa8301146382 +static int ec_stripe_update_extent(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, -+ struct ec_stripe_buf *s, -+ struct bpos end) ++ struct ec_stripe_buf *s) +{ + const struct bch_extent_ptr *ptr_c; + struct bch_extent_ptr *ptr, *ec_ptr = NULL; + struct bkey_i *n; + int ret, dev, block; + -+ if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) -+ return 1; -+ + if (extent_has_stripe_ptr(k, s->key.k.p.offset)) + return 0; + @@ -40834,19 +41727,74 @@ index 000000000000..aa8301146382 + return bch2_trans_update(trans, iter, n, 0); +} + -+static int ec_stripe_update_extents(struct bch_fs *c, -+ struct ec_stripe_buf *s, -+ struct bkey *pos) ++static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_buf *s, ++ unsigned block) +{ ++ struct bch_fs *c = trans->c; ++ struct bch_extent_ptr bucket = s->key.v.ptrs[block]; ++ struct bpos bucket_pos = PTR_BUCKET_POS(c, &bucket); ++ struct bch_backpointer bp; + struct btree_iter iter; + struct bkey_s_c k; ++ u64 bp_offset = 0; ++ int ret = 0; ++retry: ++ while (1) { ++ bch2_trans_begin(trans); + -+ return bch2_trans_run(c, -+ for_each_btree_key_commit(&trans, iter, -+ BTREE_ID_extents, bkey_start_pos(pos), -+ BTREE_ITER_NOT_EXTENTS|BTREE_ITER_INTENT, k, -+ NULL, NULL, BTREE_INSERT_NOFAIL, -+ ec_stripe_update_extent(&trans, &iter, k, s, pos->p))); ++ ret = bch2_get_next_backpointer(trans, bucket_pos, bucket.gen, ++ &bp_offset, &bp, ++ BTREE_ITER_CACHED); ++ if (ret) ++ break; ++ if (bp_offset == U64_MAX) ++ break; ++ ++ 
if (bch2_fs_inconsistent_on(bp.level, c, "found btree node in erasure coded bucket!?")) { ++ ret = -EIO; ++ break; ++ } ++ ++ k = bch2_backpointer_get_key(trans, &iter, bucket_pos, bp_offset, bp); ++ ret = bkey_err(k); ++ if (ret) ++ break; ++ if (!k.k) ++ continue; ++ ++ ret = ec_stripe_update_extent(trans, &iter, k, s); ++ bch2_trans_iter_exit(trans, &iter); ++ if (ret) ++ break; ++ ++ bp_offset++; ++ } ++ ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ ++ return ret; ++} ++ ++static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s) ++{ ++ struct btree_trans trans; ++ struct bch_stripe *v = &s->key.v; ++ unsigned i, nr_data = v->nr_blocks - v->nr_redundant; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for (i = 0; i < nr_data; i++) { ++ ret = ec_stripe_update_bucket(&trans, s, i); ++ if (ret) ++ break; ++ } ++ ++ ++ bch2_trans_exit(&trans); ++ ++ return ret; +} + +/* @@ -40856,7 +41804,6 @@ index 000000000000..aa8301146382 +{ + struct bch_fs *c = s->c; + struct open_bucket *ob; -+ struct bkey_i *k; + struct stripe *m; + struct bch_stripe *v = &s->new_stripe.key.v; + unsigned i, nr_data = v->nr_blocks - v->nr_redundant; @@ -40916,14 +41863,10 @@ index 000000000000..aa8301146382 + goto err_put_writes; + } + -+ for_each_keylist_key(&s->keys, k) { -+ ret = ec_stripe_update_extents(c, &s->new_stripe, &k->k); -+ if (ret) { -+ bch_err(c, "error creating stripe: error updating pointers: %s", -+ bch2_err_str(ret)); -+ break; -+ } -+ } ++ ret = ec_stripe_update_extents(c, &s->new_stripe); ++ if (ret) ++ bch_err(c, "error creating stripe: error updating pointers: %s", ++ bch2_err_str(ret)); + + spin_lock(&c->ec_stripes_heap_lock); + m = genradix_ptr(&c->stripes, s->new_stripe.key.k.p.offset); @@ -40948,8 +41891,6 @@ index 000000000000..aa8301146382 + } + } + -+ bch2_keylist_free(&s->keys, s->inline_keys); -+ + ec_stripe_buf_exit(&s->existing_stripe); + ec_stripe_buf_exit(&s->new_stripe); + 
closure_debug_destroy(&s->iodone); @@ -41032,30 +41973,6 @@ index 000000000000..aa8301146382 + return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9); +} + -+void bch2_ob_add_backpointer(struct bch_fs *c, struct open_bucket *ob, -+ struct bkey *k) -+{ -+ struct ec_stripe_new *ec = ob->ec; -+ -+ if (!ec) -+ return; -+ -+ mutex_lock(&ec->lock); -+ -+ if (bch2_keylist_realloc(&ec->keys, ec->inline_keys, -+ ARRAY_SIZE(ec->inline_keys), -+ BKEY_U64s)) { -+ BUG(); -+ } -+ -+ bkey_init(&ec->keys.top->k); -+ ec->keys.top->k.p = k->p; -+ ec->keys.top->k.size = k->size; -+ bch2_keylist_push(&ec->keys); -+ -+ mutex_unlock(&ec->lock); -+} -+ +static int unsigned_cmp(const void *_l, const void *_r) +{ + unsigned l = *((const unsigned *) _l); @@ -41148,8 +42065,6 @@ index 000000000000..aa8301146382 + BCH_BKEY_PTRS_MAX) - h->redundancy; + s->nr_parity = h->redundancy; + -+ bch2_keylist_init(&s->keys, s->inline_keys); -+ + ec_stripe_key_init(c, &s->new_stripe.key, s->nr_data, + s->nr_parity, h->blocksize); + @@ -41360,10 +42275,8 @@ index 000000000000..aa8301146382 + int ret; + + idx = get_existing_stripe(c, h); -+ if (idx < 0) { -+ bch_err(c, "failed to find an existing stripe"); ++ if (idx < 0) + return -BCH_ERR_ENOSPC_stripe_reuse; -+ } + + h->s->have_existing_stripe = true; + ret = get_stripe_key(c, idx, &h->s->existing_stripe); @@ -41401,21 +42314,9 @@ index 000000000000..aa8301146382 +static int __bch2_ec_stripe_head_reserve(struct bch_fs *c, + struct ec_stripe_head *h) +{ -+ int ret; -+ -+ ret = bch2_disk_reservation_get(c, &h->s->res, -+ h->blocksize, -+ h->s->nr_parity, 0); -+ -+ if (ret) { -+ /* -+ * This means we need to wait for copygc to -+ * empty out buckets from existing stripes: -+ */ -+ bch_err(c, "failed to reserve stripe"); -+ } -+ -+ return ret; ++ return bch2_disk_reservation_get(c, &h->s->res, ++ h->blocksize, ++ h->s->nr_parity, 0); +} + +struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, @@ -41457,8 +42358,10 @@ index 
000000000000..aa8301146382 + ret = __bch2_ec_stripe_head_reserve(c, h); + if (ret && needs_stripe_new) + ret = __bch2_ec_stripe_head_reuse(c, h); -+ if (ret) ++ if (ret) { ++ bch_err_ratelimited(c, "failed to get stripe: %s", bch2_err_str(ret)); + goto err; ++ } + + if (!h->s->allocated) { + ret = new_stripe_alloc_buckets(c, h, cl); @@ -41647,30 +42550,29 @@ index 000000000000..aa8301146382 +} diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h new file mode 100644 -index 000000000000..a4c13d61af10 +index 000000000000..aba1e82bc889 --- /dev/null +++ b/fs/bcachefs/ec.h -@@ -0,0 +1,230 @@ +@@ -0,0 +1,224 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_EC_H +#define _BCACHEFS_EC_H + +#include "ec_types.h" +#include "buckets_types.h" -+#include "keylist_types.h" + +int bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c, + int rw, struct printbuf *); +void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + -+#define bch2_bkey_ops_stripe (struct bkey_ops) { \ ++#define bch2_bkey_ops_stripe ((struct bkey_ops) { \ + .key_invalid = bch2_stripe_invalid, \ + .val_to_text = bch2_stripe_to_text, \ + .swab = bch2_ptr_swab, \ + .trans_trigger = bch2_trans_mark_stripe, \ + .atomic_trigger = bch2_mark_stripe, \ -+} ++}) + +static inline unsigned stripe_csums_per_device(const struct bch_stripe *s) +{ @@ -41819,9 +42721,6 @@ index 000000000000..a4c13d61af10 + open_bucket_idx_t blocks[BCH_BKEY_PTRS_MAX]; + struct disk_reservation res; + -+ struct keylist keys; -+ u64 inline_keys[BKEY_U64s * 8]; -+ + struct ec_stripe_buf new_stripe; + struct ec_stripe_buf existing_stripe; +}; @@ -41849,8 +42748,6 @@ index 000000000000..a4c13d61af10 +int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *); + +void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); -+void bch2_ob_add_backpointer(struct bch_fs *, struct open_bucket *, -+ struct bkey *); + +void bch2_ec_bucket_written(struct bch_fs *, struct open_bucket *); +void 
bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *); @@ -41935,10 +42832,10 @@ index 000000000000..edd93da663c1 +#endif /* _BCACHEFS_EC_TYPES_H */ diff --git a/fs/bcachefs/errcode.c b/fs/bcachefs/errcode.c new file mode 100644 -index 000000000000..cc9ce0be356e +index 000000000000..dc906fc9176f --- /dev/null +++ b/fs/bcachefs/errcode.c -@@ -0,0 +1,62 @@ +@@ -0,0 +1,63 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -41964,6 +42861,7 @@ index 000000000000..cc9ce0be356e +const char *bch2_err_str(int err) +{ + const char *errstr; ++ + err = abs(err); + + BUG_ON(err >= BCH_ERR_MAX); @@ -42003,10 +42901,10 @@ index 000000000000..cc9ce0be356e +} diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h new file mode 100644 -index 000000000000..fc0bb5f8873a +index 000000000000..9f293040b253 --- /dev/null +++ b/fs/bcachefs/errcode.h -@@ -0,0 +1,96 @@ +@@ -0,0 +1,97 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ERRCODE_H +#define _BCACHEFS_ERRCODE_H @@ -42051,6 +42949,7 @@ index 000000000000..fc0bb5f8873a + x(BCH_ERR_transaction_restart, transaction_restart_key_cache_raced) \ + x(BCH_ERR_transaction_restart, transaction_restart_key_cache_realloced)\ + x(BCH_ERR_transaction_restart, transaction_restart_journal_preres_get) \ ++ x(BCH_ERR_transaction_restart, transaction_restart_split_race) \ + x(BCH_ERR_transaction_restart, transaction_restart_nested) \ + x(0, no_btree_node) \ + x(BCH_ERR_no_btree_node, no_btree_node_relock) \ @@ -42105,10 +43004,10 @@ index 000000000000..fc0bb5f8873a +#endif /* _BCACHFES_ERRCODE_H */ diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c new file mode 100644 -index 000000000000..762abdf2f283 +index 000000000000..2fb5102ee31d --- /dev/null +++ b/fs/bcachefs/error.c -@@ -0,0 +1,218 @@ +@@ -0,0 +1,221 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "error.h" @@ -42215,7 +43114,7 @@ index 000000000000..762abdf2f283 +{ + struct fsck_err_state *s = NULL; + va_list args; -+ bool 
print = true, suppressing = false; ++ bool print = true, suppressing = false, inconsistent = false; + struct printbuf buf = PRINTBUF, *out = &buf; + int ret = -BCH_ERR_fsck_ignore; + @@ -42247,7 +43146,7 @@ index 000000000000..762abdf2f283 + if (c->opts.errors != BCH_ON_ERROR_continue || + !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) { + prt_str(out, ", shutting down"); -+ bch2_inconsistent_error(c); ++ inconsistent = true; + ret = -BCH_ERR_fsck_errors_not_fixed; + } else if (flags & FSCK_CAN_FIX) { + prt_str(out, ", fixing"); @@ -42300,6 +43199,9 @@ index 000000000000..762abdf2f283 + + printbuf_exit(&buf); + ++ if (inconsistent) ++ bch2_inconsistent_error(c); ++ + if (ret == -BCH_ERR_fsck_fix) { + set_bit(BCH_FS_ERRORS_FIXED, &c->flags); + } else { @@ -42759,7 +43661,7 @@ index 000000000000..6f5cf449361a +#endif /* _BCACHEFS_EXTENT_UPDATE_H */ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c new file mode 100644 -index 000000000000..2ca13014b9c4 +index 000000000000..9e2a4ed48b42 --- /dev/null +++ b/fs/bcachefs/extents.c @@ -0,0 +1,1324 @@ @@ -43057,7 +43959,7 @@ index 000000000000..2ca13014b9c4 + if (lp.crc.offset + lp.crc.live_size + rp.crc.live_size <= + lp.crc.uncompressed_size) { + /* can use left extent's crc entry */ -+ } else if (lp.crc.live_size <= rp.crc.offset ) { ++ } else if (lp.crc.live_size <= rp.crc.offset) { + /* can use right extent's crc entry */ + } else { + /* check if checksums can be merged: */ @@ -43116,7 +44018,7 @@ index 000000000000..2ca13014b9c4 + if (crc_l.offset + crc_l.live_size + crc_r.live_size <= + crc_l.uncompressed_size) { + /* can use left extent's crc entry */ -+ } else if (crc_l.live_size <= crc_r.offset ) { ++ } else if (crc_l.live_size <= crc_r.offset) { + /* can use right extent's crc entry */ + crc_r.offset -= crc_l.live_size; + bch2_extent_crc_pack(entry_to_crc(en_l), crc_r, @@ -44089,10 +44991,10 @@ index 000000000000..2ca13014b9c4 +} diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h new file mode 100644 
-index 000000000000..3c17b81130bb +index 000000000000..224df17206cb --- /dev/null +++ b/fs/bcachefs/extents.h -@@ -0,0 +1,685 @@ +@@ -0,0 +1,689 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_EXTENTS_H +#define _BCACHEFS_EXTENTS_H @@ -44293,6 +45195,7 @@ index 000000000000..3c17b81130bb + switch (k.k->type) { + case KEY_TYPE_btree_ptr: { + struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k); ++ + return (struct bkey_ptrs_c) { + to_entry(&e.v->start[0]), + to_entry(extent_entry_last(e)) @@ -44300,6 +45203,7 @@ index 000000000000..3c17b81130bb + } + case KEY_TYPE_extent: { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); ++ + return (struct bkey_ptrs_c) { + e.v->start, + extent_entry_last(e) @@ -44307,6 +45211,7 @@ index 000000000000..3c17b81130bb + } + case KEY_TYPE_stripe: { + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); ++ + return (struct bkey_ptrs_c) { + to_entry(&s.v->ptrs[0]), + to_entry(&s.v->ptrs[s.v->nr_blocks]), @@ -44322,6 +45227,7 @@ index 000000000000..3c17b81130bb + } + case KEY_TYPE_btree_ptr_v2: { + struct bkey_s_c_btree_ptr_v2 e = bkey_s_c_to_btree_ptr_v2(k); ++ + return (struct bkey_ptrs_c) { + to_entry(&e.v->start[0]), + to_entry(extent_entry_last(e)) @@ -44437,7 +45343,7 @@ index 000000000000..3c17b81130bb + +#define extent_for_each_entry_from(_e, _entry, _start) \ + __bkey_extent_entry_for_each_from(_start, \ -+ extent_entry_last(_e),_entry) ++ extent_entry_last(_e), _entry) + +#define extent_for_each_entry(_e, _entry) \ + extent_for_each_entry_from(_e, _entry, (_e).v->start) @@ -44471,28 +45377,28 @@ index 000000000000..3c17b81130bb +void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, + int, struct bkey_s); + -+#define bch2_bkey_ops_btree_ptr (struct bkey_ops) { \ ++#define bch2_bkey_ops_btree_ptr ((struct bkey_ops) { \ + .key_invalid = bch2_btree_ptr_invalid, \ + .val_to_text = bch2_btree_ptr_to_text, \ + .swab = bch2_ptr_swab, \ + .trans_trigger = bch2_trans_mark_extent, \ + .atomic_trigger = 
bch2_mark_extent, \ -+} ++}) + -+#define bch2_bkey_ops_btree_ptr_v2 (struct bkey_ops) { \ ++#define bch2_bkey_ops_btree_ptr_v2 ((struct bkey_ops) { \ + .key_invalid = bch2_btree_ptr_v2_invalid, \ + .val_to_text = bch2_btree_ptr_v2_to_text, \ + .swab = bch2_ptr_swab, \ + .compat = bch2_btree_ptr_v2_compat, \ + .trans_trigger = bch2_trans_mark_extent, \ + .atomic_trigger = bch2_mark_extent, \ -+} ++}) + +/* KEY_TYPE_extent: */ + +bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); + -+#define bch2_bkey_ops_extent (struct bkey_ops) { \ ++#define bch2_bkey_ops_extent ((struct bkey_ops) { \ + .key_invalid = bch2_bkey_ptrs_invalid, \ + .val_to_text = bch2_bkey_ptrs_to_text, \ + .swab = bch2_ptr_swab, \ @@ -44500,7 +45406,7 @@ index 000000000000..3c17b81130bb + .key_merge = bch2_extent_merge, \ + .trans_trigger = bch2_trans_mark_extent, \ + .atomic_trigger = bch2_mark_extent, \ -+} ++}) + +/* KEY_TYPE_reservation: */ + @@ -44509,13 +45415,13 @@ index 000000000000..3c17b81130bb +void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); + -+#define bch2_bkey_ops_reservation (struct bkey_ops) { \ ++#define bch2_bkey_ops_reservation ((struct bkey_ops) { \ + .key_invalid = bch2_reservation_invalid, \ + .val_to_text = bch2_reservation_to_text, \ + .key_merge = bch2_reservation_merge, \ + .trans_trigger = bch2_trans_mark_reservation, \ + .atomic_trigger = bch2_mark_reservation, \ -+} ++}) + +/* Extent checksum entries: */ + @@ -45113,7 +46019,7 @@ index 000000000000..05429c9631cd +#endif /* _EYTZINGER_H */ diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h new file mode 100644 -index 000000000000..cdb272708a4b +index 000000000000..66b945be10c2 --- /dev/null +++ b/fs/bcachefs/fifo.h @@ -0,0 +1,127 @@ @@ -45184,7 +46090,7 @@ index 000000000000..cdb272708a4b + (((p) - (fifo)->data))) + +#define fifo_entry_idx(fifo, p) (((p) - &fifo_peek_front(fifo)) & 
(fifo)->mask) -+#define fifo_idx_entry(fifo, i) (fifo)->data[((fifo)->front + (i)) & (fifo)->mask] ++#define fifo_idx_entry(fifo, i) ((fifo)->data[((fifo)->front + (i)) & (fifo)->mask]) + +#define fifo_push_back_ref(f) \ + (fifo_full((f)) ? NULL : &(f)->data[(f)->back++ & (f)->mask]) @@ -45246,10 +46152,10 @@ index 000000000000..cdb272708a4b +#endif /* _BCACHEFS_FIFO_H */ diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c new file mode 100644 -index 000000000000..53ffc684223c +index 000000000000..1f2e1fc4f6b2 --- /dev/null +++ b/fs/bcachefs/fs-common.c -@@ -0,0 +1,496 @@ +@@ -0,0 +1,501 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -45464,6 +46370,11 @@ index 000000000000..53ffc684223c + if (ret) + goto err; + ++ if (bch2_reinherit_attrs(inode_u, dir_u)) { ++ ret = -EXDEV; ++ goto err; ++ } ++ + dir_u->bi_mtime = dir_u->bi_ctime = now; + + dir_hash = bch2_hash_info_init(c, dir_u); @@ -45734,11 +46645,11 @@ index 000000000000..53ffc684223c + ret = bch2_inode_write(trans, &src_dir_iter, src_dir_u) ?: + (src_dir.inum != dst_dir.inum + ? bch2_inode_write(trans, &dst_dir_iter, dst_dir_u) -+ : 0 ) ?: ++ : 0) ?: + bch2_inode_write(trans, &src_inode_iter, src_inode_u) ?: + (dst_inum.inum + ? 
bch2_inode_write(trans, &dst_inode_iter, dst_inode_u) -+ : 0 ); ++ : 0); +err: + bch2_trans_iter_exit(trans, &dst_inode_iter); + bch2_trans_iter_exit(trans, &src_inode_iter); @@ -45797,10 +46708,10 @@ index 000000000000..dde237859514 +#endif /* _BCACHEFS_FS_COMMON_H */ diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c new file mode 100644 -index 000000000000..7d45f4863469 +index 000000000000..fd3c3ea3ce18 --- /dev/null +++ b/fs/bcachefs/fs-io.c -@@ -0,0 +1,3421 @@ +@@ -0,0 +1,3577 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + @@ -45868,7 +46779,6 @@ index 000000000000..7d45f4863469 +}; + +struct bch_writepage_io { -+ struct closure cl; + struct bch_inode_info *inode; + + /* must be last: */ @@ -45876,8 +46786,9 @@ index 000000000000..7d45f4863469 +}; + +struct dio_write { -+ struct completion done; + struct kiocb *req; ++ struct address_space *mapping; ++ struct bch_inode_info *inode; + struct mm_struct *mm; + unsigned loop:1, + sync:1, @@ -45901,7 +46812,7 @@ index 000000000000..7d45f4863469 +}; + +/* pagecache_block must be held */ -+static int write_invalidate_inode_pages_range(struct address_space *mapping, ++static noinline int write_invalidate_inode_pages_range(struct address_space *mapping, + loff_t start, loff_t end) +{ + int ret; @@ -45954,7 +46865,7 @@ index 000000000000..7d45f4863469 +static int bch2_quota_reservation_add(struct bch_fs *c, + struct bch_inode_info *inode, + struct quota_res *res, -+ unsigned sectors, ++ u64 sectors, + bool check_enospc) +{ + int ret; @@ -46409,7 +47320,7 @@ index 000000000000..7d45f4863469 +static int bch2_page_reservation_get(struct bch_fs *c, + struct bch_inode_info *inode, struct page *page, + struct bch2_page_reservation *res, -+ unsigned offset, unsigned len, bool check_enospc) ++ unsigned offset, unsigned len) +{ + struct bch_page_state *s = bch2_page_state_create(page, 0); + unsigned i, disk_sectors = 0, quota_sectors = 0; @@ -46429,19 +47340,14 @@ index 000000000000..7d45f4863469 + } + + 
if (disk_sectors) { -+ ret = bch2_disk_reservation_add(c, &res->disk, -+ disk_sectors, -+ !check_enospc -+ ? BCH_DISK_RESERVATION_NOFAIL -+ : 0); ++ ret = bch2_disk_reservation_add(c, &res->disk, disk_sectors, 0); + if (unlikely(ret)) + return ret; + } + + if (quota_sectors) { + ret = bch2_quota_reservation_add(c, inode, &res->quota, -+ quota_sectors, -+ check_enospc); ++ quota_sectors, true); + if (unlikely(ret)) { + struct disk_reservation tmp = { + .sectors = disk_sectors @@ -46625,7 +47531,7 @@ index 000000000000..7d45f4863469 + } + } + -+ if (bch2_page_reservation_get(c, inode, page, &res, 0, len, true)) { ++ if (bch2_page_reservation_get(c, inode, page, &res, 0, len)) { + unlock_page(page); + ret = VM_FAULT_SIGBUS; + goto out; @@ -47008,18 +47914,10 @@ index 000000000000..7d45f4863469 + }; +} + -+static void bch2_writepage_io_free(struct closure *cl) ++static void bch2_writepage_io_done(struct bch_write_op *op) +{ -+ struct bch_writepage_io *io = container_of(cl, -+ struct bch_writepage_io, cl); -+ -+ bio_put(&io->op.wbio.bio); -+} -+ -+static void bch2_writepage_io_done(struct closure *cl) -+{ -+ struct bch_writepage_io *io = container_of(cl, -+ struct bch_writepage_io, cl); ++ struct bch_writepage_io *io = ++ container_of(op, struct bch_writepage_io, op); + struct bch_fs *c = io->op.c; + struct bio *bio = &io->op.wbio.bio; + struct bvec_iter_all iter; @@ -47081,7 +47979,7 @@ index 000000000000..7d45f4863469 + end_page_writeback(bvec->bv_page); + } + -+ closure_return_with_destructor(&io->cl, bch2_writepage_io_free); ++ bio_put(&io->op.wbio.bio); +} + +static void bch2_writepage_do_io(struct bch_writepage_state *w) @@ -47089,8 +47987,7 @@ index 000000000000..7d45f4863469 + struct bch_writepage_io *io = w->io; + + w->io = NULL; -+ closure_call(&io->op.cl, bch2_write, NULL, &io->cl); -+ continue_at(&io->cl, bch2_writepage_io_done, NULL); ++ closure_call(&io->op.cl, bch2_write, NULL, NULL); +} + +/* @@ -47112,9 +48009,7 @@ index 000000000000..7d45f4863469 + 
&c->writepage_bioset), + struct bch_writepage_io, op.wbio.bio); + -+ closure_init(&w->io->cl, NULL); + w->io->inode = inode; -+ + op = &w->io->op; + bch2_write_op_init(op, c, w->opts); + op->target = w->opts.foreground_target; @@ -47123,6 +48018,7 @@ index 000000000000..7d45f4863469 + op->write_point = writepoint_hashed(inode->ei_last_dirtied); + op->subvol = inode->ei_subvol; + op->pos = POS(inode->v.i_ino, sector); ++ op->end_io = bch2_writepage_io_done; + op->wbio.bio.bi_iter.bi_sector = sector; + op->wbio.bio.bi_opf = wbc_to_write_flags(wbc); +} @@ -47246,7 +48142,8 @@ index 000000000000..7d45f4863469 + + /* Check for writing past i_size: */ + WARN_ON_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) > -+ round_up(i_size, block_bytes(c))); ++ round_up(i_size, block_bytes(c)) && ++ !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)); + + w->io->op.res.sectors += reserved_sectors; + w->io->op.i_sectors_delta -= dirty_sectors; @@ -47330,11 +48227,10 @@ index 000000000000..7d45f4863469 + if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) { + ret = bch2_page_state_set(c, inode_inum(inode), &page, 1); + if (ret) -+ goto out; ++ goto err; + } + -+ ret = bch2_page_reservation_get(c, inode, page, res, -+ offset, len, true); ++ ret = bch2_page_reservation_get(c, inode, page, res, offset, len); + if (ret) { + if (!PageUptodate(page)) { + /* @@ -47475,10 +48371,21 @@ index 000000000000..7d45f4863469 + goto out; + } + ++ /* ++ * XXX: per POSIX and fstests generic/275, on -ENOSPC we're ++ * supposed to write as much as we have disk space for. 
++ * ++ * On failure here we should still write out a partial page if ++ * we aren't completely out of disk space - we don't do that ++ * yet: ++ */ + ret = bch2_page_reservation_get(c, inode, page, &res, -+ pg_offset, pg_len, true); -+ if (ret) -+ goto out; ++ pg_offset, pg_len); ++ if (unlikely(ret)) { ++ if (!reserved) ++ goto out; ++ break; ++ } + + reserved += pg_len; + } @@ -47487,13 +48394,13 @@ index 000000000000..7d45f4863469 + for (i = 0; i < nr_pages; i++) + flush_dcache_page(pages[i]); + -+ while (copied < len) { ++ while (copied < reserved) { + struct page *page = pages[(offset + copied) >> PAGE_SHIFT]; + unsigned pg_offset = (offset + copied) & (PAGE_SIZE - 1); -+ unsigned pg_len = min_t(unsigned, len - copied, ++ unsigned pg_len = min_t(unsigned, reserved - copied, + PAGE_SIZE - pg_offset); + unsigned pg_copied = copy_page_from_iter_atomic(page, -+ pg_offset, pg_len,iter); ++ pg_offset, pg_len, iter); + + if (!pg_copied) + break; @@ -47779,11 +48686,13 @@ index 000000000000..7d45f4863469 + if (iocb->ki_flags & IOCB_DIRECT) { + struct blk_plug plug; + -+ ret = filemap_write_and_wait_range(mapping, -+ iocb->ki_pos, -+ iocb->ki_pos + count - 1); -+ if (ret < 0) -+ goto out; ++ if (unlikely(mapping->nrpages)) { ++ ret = filemap_write_and_wait_range(mapping, ++ iocb->ki_pos, ++ iocb->ki_pos + count - 1); ++ if (ret < 0) ++ goto out; ++ } + + file_accessed(file); + @@ -47848,31 +48757,107 @@ index 000000000000..7d45f4863469 + return err ? 
false : ret; +} + ++static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio) ++{ ++ struct bch_fs *c = dio->op.c; ++ struct bch_inode_info *inode = dio->inode; ++ struct bio *bio = &dio->op.wbio.bio; ++ ++ return bch2_check_range_allocated(c, inode_inum(inode), ++ dio->op.pos.offset, bio_sectors(bio), ++ dio->op.opts.data_replicas, ++ dio->op.opts.compression != 0); ++} ++ +static void bch2_dio_write_loop_async(struct bch_write_op *); + -+static long bch2_dio_write_loop(struct dio_write *dio) ++static noinline int bch2_dio_write_copy_iov(struct dio_write *dio) +{ -+ bool kthread = (current->flags & PF_KTHREAD) != 0; ++ struct iovec *iov = dio->inline_vecs; ++ ++ if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) { ++ iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov), ++ GFP_KERNEL); ++ if (unlikely(!iov)) ++ return -ENOMEM; ++ ++ dio->free_iov = true; ++ } ++ ++ memcpy(iov, dio->iter.iov, dio->iter.nr_segs * sizeof(*iov)); ++ dio->iter.iov = iov; ++ return 0; ++} ++ ++static __always_inline long bch2_dio_write_done(struct dio_write *dio) ++{ ++ struct bch_fs *c = dio->op.c; + struct kiocb *req = dio->req; -+ struct address_space *mapping = req->ki_filp->f_mapping; -+ struct bch_inode_info *inode = file_bch_inode(req->ki_filp); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_inode_info *inode = dio->inode; ++ bool sync = dio->sync; ++ long ret = dio->op.error ?: ((long) dio->written << 9); ++ ++ bch2_pagecache_block_put(&inode->ei_pagecache_lock); ++ bch2_quota_reservation_put(c, inode, &dio->quota_res); ++ ++ if (dio->free_iov) ++ kfree(dio->iter.iov); ++ bio_put(&dio->op.wbio.bio); ++ ++ /* inode->i_dio_count is our ref on inode and thus bch_fs */ ++ inode_dio_end(&inode->v); ++ ++ if (ret < 0) ++ ret = bch2_err_class(ret); ++ ++ if (!sync) { ++ req->ki_complete(req, ret); ++ ret = -EIOCBQUEUED; ++ } ++ return ret; ++} ++ ++static __always_inline void bch2_dio_write_end(struct dio_write *dio) ++{ ++ struct bch_fs *c = 
dio->op.c; ++ struct kiocb *req = dio->req; ++ struct bch_inode_info *inode = dio->inode; + struct bio *bio = &dio->op.wbio.bio; + struct bvec_iter_all iter; + struct bio_vec *bv; ++ ++ i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta); ++ req->ki_pos += (u64) dio->op.written << 9; ++ dio->written += dio->op.written; ++ ++ spin_lock(&inode->v.i_lock); ++ if (req->ki_pos > inode->v.i_size) ++ i_size_write(&inode->v, req->ki_pos); ++ spin_unlock(&inode->v.i_lock); ++ ++ if (likely(!bio_flagged(bio, BIO_NO_PAGE_REF))) ++ bio_for_each_segment_all(bv, bio, iter) ++ put_page(bv->bv_page); ++ ++ if (unlikely(dio->op.error)) ++ set_bit(EI_INODE_ERROR, &inode->ei_flags); ++} ++ ++static long bch2_dio_write_loop(struct dio_write *dio) ++{ ++ struct bch_fs *c = dio->op.c; ++ struct kiocb *req = dio->req; ++ struct address_space *mapping = dio->mapping; ++ struct bch_inode_info *inode = dio->inode; ++ struct bio *bio = &dio->op.wbio.bio; + unsigned unaligned, iter_count; + bool sync = dio->sync, dropped_locks; + long ret; + -+ if (dio->loop) -+ goto loop; -+ + while (1) { + iter_count = dio->iter.count; + -+ if (kthread && dio->mm) -+ kthread_use_mm(dio->mm); -+ BUG_ON(current->faults_disabled_mapping); ++ EBUG_ON(current->faults_disabled_mapping); + current->faults_disabled_mapping = mapping; + + ret = bio_iov_iter_get_pages(bio, &dio->iter); @@ -47880,8 +48865,6 @@ index 000000000000..7d45f4863469 + dropped_locks = fdm_dropped_locks(); + + current->faults_disabled_mapping = NULL; -+ if (kthread && dio->mm) -+ kthread_unuse_mm(dio->mm); + + /* + * If the fault handler returned an error but also signalled @@ -47919,13 +48902,17 @@ index 000000000000..7d45f4863469 + } + + bch2_write_op_init(&dio->op, c, io_opts(c, &inode->ei_inode)); -+ dio->op.end_io = bch2_dio_write_loop_async; ++ dio->op.end_io = sync ++ ? 
NULL ++ : bch2_dio_write_loop_async; + dio->op.target = dio->op.opts.foreground_target; + dio->op.write_point = writepoint_hashed((unsigned long) current); + dio->op.nr_replicas = dio->op.opts.data_replicas; + dio->op.subvol = inode->ei_subvol; + dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9); + ++ if (sync) ++ dio->op.flags |= BCH_WRITE_SYNC; + if ((req->ki_flags & IOCB_DSYNC) && + !c->opts.journal_flush_disabled) + dio->op.flags |= BCH_WRITE_FLUSH; @@ -47934,98 +48921,64 @@ index 000000000000..7d45f4863469 + ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio), + dio->op.opts.data_replicas, 0); + if (unlikely(ret) && -+ !bch2_check_range_allocated(c, inode_inum(inode), -+ dio->op.pos.offset, bio_sectors(bio), -+ dio->op.opts.data_replicas, -+ dio->op.opts.compression != 0)) ++ !bch2_dio_write_check_allocated(dio)) + goto err; + + task_io_account_write(bio->bi_iter.bi_size); + -+ if (!dio->sync && !dio->loop && dio->iter.count) { -+ struct iovec *iov = dio->inline_vecs; ++ if (unlikely(dio->iter.count) && ++ !dio->sync && ++ !dio->loop && ++ bch2_dio_write_copy_iov(dio)) ++ dio->sync = sync = true; + -+ if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) { -+ iov = kmalloc(dio->iter.nr_segs * sizeof(*iov), -+ GFP_KERNEL); -+ if (unlikely(!iov)) { -+ dio->sync = sync = true; -+ goto do_io; -+ } -+ -+ dio->free_iov = true; -+ } -+ -+ memcpy(iov, dio->iter.iov, dio->iter.nr_segs * sizeof(*iov)); -+ dio->iter.iov = iov; -+ } -+do_io: + dio->loop = true; + closure_call(&dio->op.cl, bch2_write, NULL, NULL); + -+ if (sync) -+ wait_for_completion(&dio->done); -+ else ++ if (!sync) + return -EIOCBQUEUED; -+loop: -+ i_sectors_acct(c, inode, &dio->quota_res, -+ dio->op.i_sectors_delta); -+ req->ki_pos += (u64) dio->op.written << 9; -+ dio->written += dio->op.written; + -+ spin_lock(&inode->v.i_lock); -+ if (req->ki_pos > inode->v.i_size) -+ i_size_write(&inode->v, req->ki_pos); -+ spin_unlock(&inode->v.i_lock); ++ bch2_dio_write_end(dio); + -+ if 
(likely(!bio_flagged(bio, BIO_NO_PAGE_REF))) -+ bio_for_each_segment_all(bv, bio, iter) -+ put_page(bv->bv_page); -+ bio->bi_vcnt = 0; -+ -+ if (dio->op.error) { -+ set_bit(EI_INODE_ERROR, &inode->ei_flags); -+ break; -+ } -+ -+ if (!dio->iter.count) ++ if (likely(!dio->iter.count) || dio->op.error) + break; + + bio_reset(bio, NULL, REQ_OP_WRITE); -+ reinit_completion(&dio->done); + } -+ -+ ret = dio->op.error ?: ((long) dio->written << 9); ++out: ++ return bch2_dio_write_done(dio); +err: -+ bch2_pagecache_block_put(&inode->ei_pagecache_lock); -+ bch2_quota_reservation_put(c, inode, &dio->quota_res); ++ dio->op.error = ret; + -+ if (dio->free_iov) -+ kfree(dio->iter.iov); ++ if (!bio_flagged(bio, BIO_NO_PAGE_REF)) { ++ struct bvec_iter_all iter; ++ struct bio_vec *bv; + -+ if (likely(!bio_flagged(bio, BIO_NO_PAGE_REF))) + bio_for_each_segment_all(bv, bio, iter) + put_page(bv->bv_page); -+ bio_put(bio); -+ -+ /* inode->i_dio_count is our ref on inode and thus bch_fs */ -+ inode_dio_end(&inode->v); -+ -+ if (!sync) { -+ req->ki_complete(req, ret); -+ ret = -EIOCBQUEUED; + } -+ return ret; ++ goto out; +} + +static void bch2_dio_write_loop_async(struct bch_write_op *op) +{ + struct dio_write *dio = container_of(op, struct dio_write, op); ++ struct mm_struct *mm = dio->mm; + -+ if (dio->sync) -+ complete(&dio->done); -+ else -+ bch2_dio_write_loop(dio); ++ bch2_dio_write_end(dio); ++ ++ if (likely(!dio->iter.count) || dio->op.error) { ++ bch2_dio_write_done(dio); ++ return; ++ } ++ ++ bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE); ++ ++ if (mm) ++ kthread_use_mm(mm); ++ bch2_dio_write_loop(dio); ++ if (mm) ++ kthread_unuse_mm(mm); +} + +static noinline @@ -48077,8 +49030,9 @@ index 000000000000..7d45f4863469 + GFP_KERNEL, + &c->dio_write_bioset); + dio = container_of(bio, struct dio_write, op.wbio.bio); -+ init_completion(&dio->done); + dio->req = req; ++ dio->mapping = mapping; ++ dio->inode = inode; + dio->mm = current->mm; + dio->loop = false; + dio->sync = 
is_sync_kiocb(req) || extending; @@ -48086,17 +49040,20 @@ index 000000000000..7d45f4863469 + dio->quota_res.sectors = 0; + dio->written = 0; + dio->iter = *iter; ++ dio->op.c = c; + + ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, + iter->count >> 9, true); + if (unlikely(ret)) + goto err_put_bio; + -+ ret = write_invalidate_inode_pages_range(mapping, -+ req->ki_pos, -+ req->ki_pos + iter->count - 1); -+ if (unlikely(ret)) -+ goto err_put_bio; ++ if (unlikely(mapping->nrpages)) { ++ ret = write_invalidate_inode_pages_range(mapping, ++ req->ki_pos, ++ req->ki_pos + iter->count - 1); ++ if (unlikely(ret)) ++ goto err_put_bio; ++ } + + ret = bch2_dio_write_loop(dio); +err: @@ -48519,7 +49476,7 @@ index 000000000000..7d45f4863469 + + truncate_pagecache_range(&inode->v, offset, end - 1); + -+ if (block_start < block_end ) { ++ if (block_start < block_end) { + s64 i_sectors_delta = 0; + + ret = bch2_fpunch(c, inode_inum(inode), @@ -48902,6 +49859,10 @@ index 000000000000..7d45f4863469 + inode_dio_wait(&inode->v); + bch2_pagecache_block_get(&inode->ei_pagecache_lock); + ++ ret = file_modified(file); ++ if (ret) ++ goto err; ++ + if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE))) + ret = bchfs_fallocate(inode, mode, offset, len); + else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) @@ -48912,8 +49873,7 @@ index 000000000000..7d45f4863469 + ret = bchfs_fcollapse_finsert(inode, offset, len, false); + else + ret = -EOPNOTSUPP; -+ -+ ++err: + bch2_pagecache_block_put(&inode->ei_pagecache_lock); + inode_unlock(&inode->v); + percpu_ref_put(&c->writes); @@ -48921,6 +49881,55 @@ index 000000000000..7d45f4863469 + return bch2_err_class(ret); +} + ++static int quota_reserve_range(struct bch_inode_info *inode, ++ struct quota_res *res, ++ u64 start, u64 end) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ u32 snapshot; ++ u64 sectors = end - start; ++ u64 pos = start; 
++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_subvolume_get_snapshot(&trans, inode->ei_subvol, &snapshot); ++ if (ret) ++ goto err; ++ ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, ++ SPOS(inode->v.i_ino, pos, snapshot), 0); ++ ++ while (!(ret = btree_trans_too_many_iters(&trans)) && ++ (k = bch2_btree_iter_peek_upto(&iter, POS(inode->v.i_ino, end - 1))).k && ++ !(ret = bkey_err(k))) { ++ if (bkey_extent_is_allocation(k.k)) { ++ u64 s = min(end, k.k->p.offset) - ++ max(start, bkey_start_offset(k.k)); ++ BUG_ON(s > sectors); ++ sectors -= s; ++ } ++ bch2_btree_iter_advance(&iter); ++ } ++ pos = iter.pos.offset; ++ bch2_trans_iter_exit(&trans, &iter); ++err: ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ ++ bch2_trans_exit(&trans); ++ ++ if (ret) ++ return ret; ++ ++ return bch2_quota_reservation_add(c, inode, res, sectors, true); ++} ++ +loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, + struct file *file_dst, loff_t pos_dst, + loff_t len, unsigned remap_flags) @@ -48928,6 +49937,7 @@ index 000000000000..7d45f4863469 + struct bch_inode_info *src = file_bch_inode(file_src); + struct bch_inode_info *dst = file_bch_inode(file_dst); + struct bch_fs *c = src->v.i_sb->s_fs_info; ++ struct quota_res quota_res = { 0 }; + s64 i_sectors_delta = 0; + u64 aligned_len; + loff_t ret = 0; @@ -48948,8 +49958,6 @@ index 000000000000..7d45f4863469 + + bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); + -+ file_update_time(file_dst); -+ + inode_dio_wait(&src->v); + inode_dio_wait(&dst->v); + @@ -48966,6 +49974,13 @@ index 000000000000..7d45f4863469 + if (ret) + goto err; + ++ ret = quota_reserve_range(dst, "a_res, pos_dst >> 9, ++ (pos_dst + aligned_len) >> 9); ++ if (ret) ++ goto err; ++ ++ file_update_time(file_dst); ++ + mark_pagecache_unallocated(src, pos_src >> 9, + (pos_src + aligned_len) >> 9); + @@ -48982,8 +49997,7 @@ index 
000000000000..7d45f4863469 + */ + ret = min((u64) ret << 9, (u64) len); + -+ /* XXX get a quota reservation */ -+ i_sectors_acct(c, dst, NULL, i_sectors_delta); ++ i_sectors_acct(c, dst, "a_res, i_sectors_delta); + + spin_lock(&dst->v.i_lock); + if (pos_dst + ret > dst->v.i_size) @@ -48994,6 +50008,7 @@ index 000000000000..7d45f4863469 + IS_SYNC(file_inode(file_dst))) + ret = bch2_flush_inode(c, inode_inum(dst)); +err: ++ bch2_quota_reservation_put(c, dst, "a_res); + bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); + + return bch2_err_class(ret); @@ -49001,6 +50016,62 @@ index 000000000000..7d45f4863469 + +/* fseek: */ + ++static int page_data_offset(struct page *page, unsigned offset) ++{ ++ struct bch_page_state *s = bch2_page_state(page); ++ unsigned i; ++ ++ if (s) ++ for (i = offset >> 9; i < PAGE_SECTORS; i++) ++ if (s->s[i].state >= SECTOR_DIRTY) ++ return i << 9; ++ ++ return -1; ++} ++ ++static loff_t bch2_seek_pagecache_data(struct inode *vinode, ++ loff_t start_offset, ++ loff_t end_offset) ++{ ++ struct folio_batch fbatch; ++ pgoff_t start_index = start_offset >> PAGE_SHIFT; ++ pgoff_t end_index = end_offset >> PAGE_SHIFT; ++ pgoff_t index = start_index; ++ unsigned i; ++ loff_t ret; ++ int offset; ++ ++ folio_batch_init(&fbatch); ++ ++ while (filemap_get_folios(vinode->i_mapping, ++ &index, end_index, &fbatch)) { ++ for (i = 0; i < folio_batch_count(&fbatch); i++) { ++ struct folio *folio = fbatch.folios[i]; ++ ++ folio_lock(folio); ++ ++ offset = page_data_offset(&folio->page, ++ folio->index == start_index ++ ? 
start_offset & (PAGE_SIZE - 1) ++ : 0); ++ if (offset >= 0) { ++ ret = clamp(((loff_t) folio->index << PAGE_SHIFT) + ++ offset, ++ start_offset, end_offset); ++ folio_unlock(folio); ++ folio_batch_release(&fbatch); ++ return ret; ++ } ++ ++ folio_unlock(folio); ++ } ++ folio_batch_release(&fbatch); ++ cond_resched(); ++ } ++ ++ return end_offset; ++} ++ +static loff_t bch2_seek_data(struct file *file, u64 offset) +{ + struct bch_inode_info *inode = file_bch_inode(file); @@ -49044,13 +50115,9 @@ index 000000000000..7d45f4863469 + if (ret) + return ret; + -+ if (next_data > offset) { -+ loff_t pagecache_next_data = -+ mapping_seek_hole_data(inode->v.i_mapping, offset, -+ next_data, SEEK_DATA); -+ if (pagecache_next_data >= 0) -+ next_data = min_t(u64, next_data, pagecache_next_data); -+ } ++ if (next_data > offset) ++ next_data = bch2_seek_pagecache_data(&inode->v, ++ offset, next_data); + + if (next_data >= isize) + return -ENXIO; @@ -49284,10 +50351,10 @@ index 000000000000..a8835298613a +#endif /* _BCACHEFS_FS_IO_H */ diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c new file mode 100644 -index 000000000000..bab0707bc2f4 +index 000000000000..2bb680827b44 --- /dev/null +++ b/fs/bcachefs/fs-ioctl.c -@@ -0,0 +1,539 @@ +@@ -0,0 +1,555 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + @@ -49316,6 +50383,9 @@ index 000000000000..bab0707bc2f4 + unsigned flags; + + unsigned projid; ++ ++ bool set_projinherit; ++ bool projinherit; +}; + +static int bch2_inode_flags_set(struct bch_inode_info *inode, @@ -49340,6 +50410,11 @@ index 000000000000..bab0707bc2f4 + (newflags & (BCH_INODE_NODUMP|BCH_INODE_NOATIME)) != newflags) + return -EINVAL; + ++ if (s->set_projinherit) { ++ bi->bi_fields_set &= ~(1 << Inode_opt_project); ++ bi->bi_fields_set |= ((int) s->projinherit << Inode_opt_project); ++ } ++ + bi->bi_flags &= ~s->mask; + bi->bi_flags |= newflags; + @@ -49397,6 +50472,10 @@ index 000000000000..bab0707bc2f4 + struct fsxattr fa = { 0 }; + + 
fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags); ++ ++ if (inode->ei_inode.bi_fields_set & (1 << Inode_opt_project)) ++ fa.fsx_xflags |= FS_XFLAG_PROJINHERIT; ++ + fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ]; + + return copy_to_user(arg, &fa, sizeof(fa)); @@ -49428,6 +50507,10 @@ index 000000000000..bab0707bc2f4 + if (copy_from_user(&fa, arg, sizeof(fa))) + return -EFAULT; + ++ s.set_projinherit = true; ++ s.projinherit = (fa.fsx_xflags & FS_XFLAG_PROJINHERIT) != 0; ++ fa.fsx_xflags &= ~FS_XFLAG_PROJINHERIT; ++ + s.flags = map_flags_rev(bch_flags_to_xflags, fa.fsx_xflags); + if (fa.fsx_xflags) + return -EOPNOTSUPP; @@ -49916,10 +50999,10 @@ index 000000000000..f201980ef2c3 +#endif /* _BCACHEFS_FS_IOCTL_H */ diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c new file mode 100644 -index 000000000000..57e6e21896e1 +index 000000000000..186faa54b590 --- /dev/null +++ b/fs/bcachefs/fs.c -@@ -0,0 +1,1942 @@ +@@ -0,0 +1,1941 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + @@ -50341,7 +51424,7 @@ index 000000000000..57e6e21896e1 + (subvol_inum) { 0 }, 0); + + if (IS_ERR(inode)) -+ return PTR_ERR(inode); ++ return bch2_err_class(PTR_ERR(inode)); + + d_instantiate(dentry, &inode->v); + return 0; @@ -50450,8 +51533,8 @@ index 000000000000..57e6e21896e1 + + inode = __bch2_create(mnt_userns, dir, dentry, S_IFLNK|S_IRWXUGO, 0, + (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); -+ if (unlikely(IS_ERR(inode))) -+ return PTR_ERR(inode); ++ if (IS_ERR(inode)) ++ return bch2_err_class(PTR_ERR(inode)); + + inode_lock(&inode->v); + ret = page_symlink(&inode->v, symname, strlen(symname) + 1); @@ -50760,7 +51843,7 @@ index 000000000000..57e6e21896e1 + (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); + + if (IS_ERR(inode)) -+ return PTR_ERR(inode); ++ return bch2_err_class(PTR_ERR(inode)); + + d_mark_tmpfile(dentry, &inode->v); + d_instantiate(dentry, &inode->v); @@ -51768,7 +52851,7 @@ index 000000000000..57e6e21896e1 + sb->s_time_min = div_s64(S64_MIN, 
c->sb.time_units_per_sec) + 1; + sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec); + c->vfs_sb = sb; -+ strlcpy(sb->s_id, c->name, sizeof(sb->s_id)); ++ strscpy(sb->s_id, c->name, sizeof(sb->s_id)); + + ret = super_setup_bdi(sb); + if (ret) @@ -51839,8 +52922,7 @@ index 000000000000..57e6e21896e1 +void bch2_vfs_exit(void) +{ + unregister_filesystem(&bcache_fs_type); -+ if (bch2_inode_cache) -+ kmem_cache_destroy(bch2_inode_cache); ++ kmem_cache_destroy(bch2_inode_cache); +} + +int __init bch2_vfs_init(void) @@ -52078,7 +53160,7 @@ index 000000000000..9f4b57e30e2a +#endif /* _BCACHEFS_FS_H */ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c new file mode 100644 -index 000000000000..12f2ef4417cb +index 000000000000..f4f0e0cec85d --- /dev/null +++ b/fs/bcachefs/fsck.c @@ -0,0 +1,2395 @@ @@ -52405,7 +53487,7 @@ index 000000000000..12f2ef4417cb + bch2_trans_iter_exit(trans, &iter); +err: + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ bch_err(c, "error from __remove_dirent(): %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + return ret; +} + @@ -52590,7 +53672,7 @@ index 000000000000..12f2ef4417cb + break; + + if (i->equiv == n.equiv) { -+ bch_err(c, "adding duplicate snapshot in snapshots_seen_add()"); ++ bch_err(c, "%s(): adding duplicate snapshot", __func__); + return -EINVAL; + } + } @@ -52932,8 +54014,7 @@ index 000000000000..12f2ef4417cb + printbuf_exit(&buf); + return ret; +bad_hash: -+ if (fsck_err(c, "hash table key at wrong offset: btree %s inode %llu offset %llu, " -+ "hashed to %llu\n%s", ++ if (fsck_err(c, "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s", + bch2_btree_ids[desc.btree_id], hash_k.k->p.inode, hash_k.k->p.offset, hash, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { @@ -53084,7 +54165,7 @@ index 000000000000..12f2ef4417cb +err: +fsck_err: + if (ret) -+ bch_err(c, "error from check_inode(): %s", 
bch2_err_str(ret)); ++ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + return ret; +} + @@ -53110,7 +54191,7 @@ index 000000000000..12f2ef4417cb + bch2_trans_exit(&trans); + snapshots_seen_exit(&s); + if (ret) -+ bch_err(c, "error from check_inodes(): %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + return ret; +} + @@ -53243,7 +54324,7 @@ index 000000000000..12f2ef4417cb + } +fsck_err: + if (ret) -+ bch_err(c, "error from check_i_sectors(): %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + if (!ret && trans_was_restarted(trans, restart_count)) + ret = -BCH_ERR_transaction_restart_nested; + return ret; @@ -53379,7 +54460,7 @@ index 000000000000..12f2ef4417cb + printbuf_exit(&buf); + + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ bch_err(c, "error from check_extent(): %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + return ret; +} + @@ -53421,7 +54502,7 @@ index 000000000000..12f2ef4417cb + snapshots_seen_exit(&s); + + if (ret) -+ bch_err(c, "error from check_extents(): %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + return ret; +} + @@ -53460,7 +54541,7 @@ index 000000000000..12f2ef4417cb + } +fsck_err: + if (ret) -+ bch_err(c, "error from check_subdir_count(): %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + if (!ret && trans_was_restarted(trans, restart_count)) + ret = -BCH_ERR_transaction_restart_nested; + return ret; @@ -53581,7 +54662,7 @@ index 000000000000..12f2ef4417cb + printbuf_exit(&buf); + + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ bch_err(c, "error from check_target(): %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + return ret; +} + @@ -53751,7 +54832,7 @@ index 000000000000..12f2ef4417cb + printbuf_exit(&buf); + + if (ret && !bch2_err_matches(ret, 
BCH_ERR_transaction_restart)) -+ bch_err(c, "error from check_dirent(): %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + return ret; +} + @@ -53790,7 +54871,7 @@ index 000000000000..12f2ef4417cb + inode_walker_exit(&target); + + if (ret) -+ bch_err(c, "error from check_dirents(): %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + return ret; +} + @@ -53826,7 +54907,7 @@ index 000000000000..12f2ef4417cb + ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k); +fsck_err: + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ bch_err(c, "error from check_xattr(): %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + return ret; +} + @@ -53858,7 +54939,7 @@ index 000000000000..12f2ef4417cb + bch2_trans_exit(&trans); + + if (ret) -+ bch_err(c, "error from check_xattrs(): %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + return ret; +} + @@ -54128,7 +55209,8 @@ index 000000000000..12f2ef4417cb +{ + if (t->nr == t->size) { + size_t new_size = max_t(size_t, 128UL, t->size * 2); -+ void *d = kvmalloc(new_size * sizeof(t->d[0]), GFP_KERNEL); ++ void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL); ++ + if (!d) { + bch_err(c, "fsck: error allocating memory for nlink_table, size %zu", + new_size); @@ -54493,10 +55575,10 @@ index 000000000000..264f2706b12d +#endif /* _BCACHEFS_FSCK_H */ diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c new file mode 100644 -index 000000000000..1f2782fc5a2d +index 000000000000..1a0d2608c058 --- /dev/null +++ b/fs/bcachefs/inode.c -@@ -0,0 +1,771 @@ +@@ -0,0 +1,892 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -54559,11 +55641,10 @@ index 000000000000..1f2782fc5a2d + return bytes; +} + -+void bch2_inode_pack(struct bch_fs *c, -+ struct bkey_inode_buf *packed, -+ const struct bch_inode_unpacked *inode) ++static inline void 
bch2_inode_pack_inlined(struct bkey_inode_buf *packed, ++ const struct bch_inode_unpacked *inode) +{ -+ struct bkey_i_inode_v2 *k = &packed->inode; ++ struct bkey_i_inode_v3 *k = &packed->inode; + u8 *out = k->v.fields; + u8 *end = (void *) &packed[1]; + u8 *last_nonzero_field = out; @@ -54571,13 +55652,17 @@ index 000000000000..1f2782fc5a2d + unsigned bytes; + int ret; + -+ bkey_inode_v2_init(&packed->inode.k_i); ++ bkey_inode_v3_init(&packed->inode.k_i); + packed->inode.k.p.offset = inode->bi_inum; + packed->inode.v.bi_journal_seq = cpu_to_le64(inode->bi_journal_seq); + packed->inode.v.bi_hash_seed = inode->bi_hash_seed; + packed->inode.v.bi_flags = cpu_to_le64(inode->bi_flags); -+ packed->inode.v.bi_flags = cpu_to_le64(inode->bi_flags); -+ packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode); ++ packed->inode.v.bi_sectors = cpu_to_le64(inode->bi_sectors); ++ packed->inode.v.bi_size = cpu_to_le64(inode->bi_size); ++ packed->inode.v.bi_version = cpu_to_le64(inode->bi_version); ++ SET_INODEv3_MODE(&packed->inode.v, inode->bi_mode); ++ SET_INODEv3_FIELDS_START(&packed->inode.v, INODEv3_FIELDS_START_CUR); ++ + +#define x(_name, _bits) \ + nr_fields++; \ @@ -54598,7 +55683,7 @@ index 000000000000..1f2782fc5a2d + *out++ = 0; \ + } + -+ BCH_INODE_FIELDS() ++ BCH_INODE_FIELDS_v3() +#undef x + BUG_ON(out > end); + @@ -54609,7 +55694,7 @@ index 000000000000..1f2782fc5a2d + set_bkey_val_bytes(&packed->inode.k, bytes); + memset_u64s_tail(&packed->inode.v, 0, bytes); + -+ SET_INODEv2_NR_FIELDS(&k->v, nr_fields); ++ SET_INODEv3_NR_FIELDS(&k->v, nr_fields); + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { + struct bch_inode_unpacked unpacked; @@ -54619,16 +55704,25 @@ index 000000000000..1f2782fc5a2d + BUG_ON(ret); + BUG_ON(unpacked.bi_inum != inode->bi_inum); + BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed); ++ BUG_ON(unpacked.bi_sectors != inode->bi_sectors); ++ BUG_ON(unpacked.bi_size != inode->bi_size); ++ BUG_ON(unpacked.bi_version != inode->bi_version); + 
BUG_ON(unpacked.bi_mode != inode->bi_mode); + +#define x(_name, _bits) if (unpacked._name != inode->_name) \ + panic("unpacked %llu should be %llu", \ + (u64) unpacked._name, (u64) inode->_name); -+ BCH_INODE_FIELDS() ++ BCH_INODE_FIELDS_v3() +#undef x + } +} + ++void bch2_inode_pack(struct bkey_inode_buf *packed, ++ const struct bch_inode_unpacked *inode) ++{ ++ bch2_inode_pack_inlined(packed, inode); ++} ++ +static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode, + struct bch_inode_unpacked *unpacked) +{ @@ -54656,7 +55750,7 @@ index 000000000000..1f2782fc5a2d + unpacked->_name = field[1]; \ + in += ret; + -+ BCH_INODE_FIELDS() ++ BCH_INODE_FIELDS_v2() +#undef x + + /* XXX: signal if there were more fields than expected? */ @@ -54695,15 +55789,66 @@ index 000000000000..1f2782fc5a2d + return -1; \ + fieldnr++; + -+ BCH_INODE_FIELDS() ++ BCH_INODE_FIELDS_v2() +#undef x + + /* XXX: signal if there were more fields than expected? */ + return 0; +} + -+int bch2_inode_unpack(struct bkey_s_c k, -+ struct bch_inode_unpacked *unpacked) ++static int bch2_inode_unpack_v3(struct bkey_s_c k, ++ struct bch_inode_unpacked *unpacked) ++{ ++ struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); ++ const u8 *in = inode.v->fields; ++ const u8 *end = bkey_val_end(inode); ++ unsigned nr_fields = INODEv3_NR_FIELDS(inode.v); ++ unsigned fieldnr = 0; ++ int ret; ++ u64 v[2]; ++ ++ unpacked->bi_inum = inode.k->p.offset; ++ unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq); ++ unpacked->bi_hash_seed = inode.v->bi_hash_seed; ++ unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags); ++ unpacked->bi_sectors = le64_to_cpu(inode.v->bi_sectors); ++ unpacked->bi_size = le64_to_cpu(inode.v->bi_size); ++ unpacked->bi_version = le64_to_cpu(inode.v->bi_version); ++ unpacked->bi_mode = INODEv3_MODE(inode.v); ++ ++#define x(_name, _bits) \ ++ if (fieldnr < nr_fields) { \ ++ ret = bch2_varint_decode_fast(in, end, &v[0]); \ ++ if (ret < 0) \ ++ return ret; \ ++ in += ret; \ 
++ \ ++ if (_bits > 64) { \ ++ ret = bch2_varint_decode_fast(in, end, &v[1]); \ ++ if (ret < 0) \ ++ return ret; \ ++ in += ret; \ ++ } else { \ ++ v[1] = 0; \ ++ } \ ++ } else { \ ++ v[0] = v[1] = 0; \ ++ } \ ++ \ ++ unpacked->_name = v[0]; \ ++ if (v[1] || v[0] != unpacked->_name) \ ++ return -1; \ ++ fieldnr++; ++ ++ BCH_INODE_FIELDS_v3() ++#undef x ++ ++ /* XXX: signal if there were more fields than expected? */ ++ return 0; ++} ++ ++static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k, ++ struct bch_inode_unpacked *unpacked) +{ + switch (k.k->type) { + case KEY_TYPE_inode: { @@ -54742,6 +55887,14 @@ index 000000000000..1f2782fc5a2d + } +} + ++int bch2_inode_unpack(struct bkey_s_c k, ++ struct bch_inode_unpacked *unpacked) ++{ ++ if (likely(k.k->type == KEY_TYPE_inode_v3)) ++ return bch2_inode_unpack_v3(k, unpacked); ++ return bch2_inode_unpack_slowpath(k, unpacked); ++} ++ +int bch2_inode_peek(struct btree_trans *trans, + struct btree_iter *iter, + struct bch_inode_unpacked *inode, @@ -54787,11 +55940,29 @@ index 000000000000..1f2782fc5a2d + if (IS_ERR(inode_p)) + return PTR_ERR(inode_p); + -+ bch2_inode_pack(trans->c, inode_p, inode); ++ bch2_inode_pack_inlined(inode_p, inode); + inode_p->inode.k.p.snapshot = iter->snapshot; + return bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); +} + ++struct bkey_s_c bch2_inode_to_v3(struct btree_trans *trans, struct bkey_s_c k) ++{ ++ struct bch_inode_unpacked u; ++ struct bkey_inode_buf *inode_p; ++ int ret; ++ ++ inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); ++ if (IS_ERR(inode_p)) ++ return bkey_s_c_err(PTR_ERR(inode_p)); ++ ++ ret = bch2_inode_unpack(k, &u); ++ if (ret) ++ return bkey_s_c_err(ret); ++ ++ bch2_inode_pack(inode_p, &u); ++ return bkey_i_to_s_c(&inode_p->inode.k_i); ++} ++ +static int __bch2_inode_invalid(struct bkey_s_c k, struct printbuf *err) +{ + struct bch_inode_unpacked unpacked; @@ -54806,7 +55977,7 @@ index 000000000000..1f2782fc5a2d + return -EINVAL; + } + -+ if 
(bch2_inode_unpack(k, &unpacked)){ ++ if (bch2_inode_unpack(k, &unpacked)) { + prt_printf(err, "invalid variable length fields"); + return -EINVAL; + } @@ -54877,15 +56048,48 @@ index 000000000000..1f2782fc5a2d + return __bch2_inode_invalid(k, err); +} + -+static void __bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) ++int bch2_inode_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) +{ -+ prt_printf(out, "mode %o flags %x journal_seq %llu", ++ struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); ++ ++ if (bkey_val_bytes(k.k) < sizeof(*inode.v)) { ++ prt_printf(err, "incorrect value size (%zu < %zu)", ++ bkey_val_bytes(k.k), sizeof(*inode.v)); ++ return -EINVAL; ++ } ++ ++ if (INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL || ++ INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k)) { ++ prt_printf(err, "invalid fields_start (got %llu, min %u max %zu)", ++ INODEv3_FIELDS_START(inode.v), ++ INODEv3_FIELDS_START_INITIAL, ++ bkey_val_u64s(inode.k)); ++ return -EINVAL; ++ } ++ ++ if (INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR) { ++ prt_printf(err, "invalid str hash type (%llu >= %u)", ++ INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR); ++ return -EINVAL; ++ } ++ ++ return __bch2_inode_invalid(k, err); ++} ++ ++static void __bch2_inode_unpacked_to_text(struct printbuf *out, ++ struct bch_inode_unpacked *inode) ++{ ++ prt_printf(out, "mode %o flags %x journal_seq %llu bi_size %llu bi_sectors %llu bi_version %llu", + inode->bi_mode, inode->bi_flags, -+ inode->bi_journal_seq); ++ inode->bi_journal_seq, ++ inode->bi_size, ++ inode->bi_sectors, ++ inode->bi_version); + +#define x(_name, _bits) \ + prt_printf(out, " "#_name " %llu", (u64) inode->_name); -+ BCH_INODE_FIELDS() ++ BCH_INODE_FIELDS_v3() +#undef x +} + @@ -54895,8 +56099,7 @@ index 000000000000..1f2782fc5a2d + __bch2_inode_unpacked_to_text(out, inode); +} + -+void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, -+ 
struct bkey_s_c k) ++void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) +{ + struct bch_inode_unpacked inode; + @@ -55270,50 +56473,60 @@ index 000000000000..1f2782fc5a2d +} diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h new file mode 100644 -index 000000000000..2ac2fc10513b +index 000000000000..a9742bb63809 --- /dev/null +++ b/fs/bcachefs/inode.h -@@ -0,0 +1,189 @@ +@@ -0,0 +1,202 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_INODE_H +#define _BCACHEFS_INODE_H + ++#include "bkey.h" +#include "opts.h" + +extern const char * const bch2_inode_opts[]; + +int bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +int bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); ++int bch2_inode_v3_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + -+#define bch2_bkey_ops_inode (struct bkey_ops) { \ ++#define bch2_bkey_ops_inode ((struct bkey_ops) { \ + .key_invalid = bch2_inode_invalid, \ + .val_to_text = bch2_inode_to_text, \ + .trans_trigger = bch2_trans_mark_inode, \ + .atomic_trigger = bch2_mark_inode, \ -+} ++}) + -+#define bch2_bkey_ops_inode_v2 (struct bkey_ops) { \ ++#define bch2_bkey_ops_inode_v2 ((struct bkey_ops) { \ + .key_invalid = bch2_inode_v2_invalid, \ + .val_to_text = bch2_inode_to_text, \ + .trans_trigger = bch2_trans_mark_inode, \ + .atomic_trigger = bch2_mark_inode, \ -+} ++}) ++ ++#define bch2_bkey_ops_inode_v3 ((struct bkey_ops) { \ ++ .key_invalid = bch2_inode_v3_invalid, \ ++ .val_to_text = bch2_inode_to_text, \ ++ .trans_trigger = bch2_trans_mark_inode, \ ++ .atomic_trigger = bch2_mark_inode, \ ++}) + +static inline bool bkey_is_inode(const struct bkey *k) +{ + return k->type == KEY_TYPE_inode || -+ k->type == KEY_TYPE_inode_v2; ++ k->type == KEY_TYPE_inode_v2 || ++ k->type == KEY_TYPE_inode_v3; +} + +int 
bch2_inode_generation_invalid(const struct bch_fs *, struct bkey_s_c, + int, struct printbuf *); +void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + -+#define bch2_bkey_ops_inode_generation (struct bkey_ops) { \ ++#define bch2_bkey_ops_inode_generation ((struct bkey_ops) { \ + .key_invalid = bch2_inode_generation_invalid, \ + .val_to_text = bch2_inode_generation_to_text, \ -+} ++}) + +#if 0 +typedef struct { @@ -55327,25 +56540,28 @@ index 000000000000..2ac2fc10513b + u64 bi_inum; + u64 bi_journal_seq; + __le64 bi_hash_seed; ++ u64 bi_size; ++ u64 bi_sectors; ++ u64 bi_version; + u32 bi_flags; + u16 bi_mode; + +#define x(_name, _bits) u##_bits _name; -+ BCH_INODE_FIELDS() ++ BCH_INODE_FIELDS_v3() +#undef x +}; + +struct bkey_inode_buf { -+ struct bkey_i_inode_v2 inode; ++ struct bkey_i_inode_v3 inode; + +#define x(_name, _bits) + 8 + _bits / 8 -+ u8 _pad[0 + BCH_INODE_FIELDS()]; ++ u8 _pad[0 + BCH_INODE_FIELDS_v3()]; +#undef x -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + -+void bch2_inode_pack(struct bch_fs *, struct bkey_inode_buf *, -+ const struct bch_inode_unpacked *); ++void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *); +int bch2_inode_unpack(struct bkey_s_c, struct bch_inode_unpacked *); ++struct bkey_s_c bch2_inode_to_v3(struct btree_trans *, struct bkey_s_c); + +void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *); + @@ -55465,10 +56681,10 @@ index 000000000000..2ac2fc10513b +#endif /* _BCACHEFS_INODE_H */ diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c new file mode 100644 -index 000000000000..e047ef28f127 +index 000000000000..6348bc2d12c0 --- /dev/null +++ b/fs/bcachefs/io.c -@@ -0,0 +1,2436 @@ +@@ -0,0 +1,2469 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Some low level IO code, and hacks for various block layer limitations @@ -55487,6 +56703,7 @@ index 000000000000..e047ef28f127 +#include "checksum.h" +#include "compress.h" 
+#include "clock.h" ++#include "data_update.h" +#include "debug.h" +#include "disk_groups.h" +#include "ec.h" @@ -55713,8 +56930,10 @@ index 000000000000..e047ef28f127 + s64 *i_sectors_delta_total, + bool check_enospc) +{ -+ struct btree_iter inode_iter; -+ struct bch_inode_unpacked inode_u; ++ struct btree_iter inode_iter = { NULL }; ++ struct bkey_s_c inode_k; ++ struct bkey_s_c_inode_v3 inode; ++ struct bkey_i_inode_v3 *new_inode; + struct bpos next_pos; + bool usage_increasing; + s64 i_sectors_delta = 0, disk_sectors_delta = 0; @@ -55754,32 +56973,62 @@ index 000000000000..e047ef28f127 + return ret; + } + -+ ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inum, -+ BTREE_ITER_INTENT); -+ if (ret) -+ return ret; ++ bch2_trans_iter_init(trans, &inode_iter, BTREE_ID_inodes, ++ SPOS(0, inum.inum, iter->snapshot), ++ BTREE_ITER_INTENT|BTREE_ITER_CACHED); ++ inode_k = bch2_btree_iter_peek_slot(&inode_iter); ++ ret = bkey_err(inode_k); ++ if (unlikely(ret)) ++ goto err; + -+ if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && -+ new_i_size > inode_u.bi_size) -+ inode_u.bi_size = new_i_size; ++ ret = bkey_is_inode(inode_k.k) ? 
0 : -ENOENT; ++ if (unlikely(ret)) ++ goto err; + -+ inode_u.bi_sectors += i_sectors_delta; ++ if (unlikely(inode_k.k->type != KEY_TYPE_inode_v3)) { ++ inode_k = bch2_inode_to_v3(trans, inode_k); ++ ret = bkey_err(inode_k); ++ if (unlikely(ret)) ++ goto err; ++ } + -+ ret = bch2_trans_update(trans, iter, k, 0) ?: -+ bch2_inode_write(trans, &inode_iter, &inode_u) ?: ++ inode = bkey_s_c_to_inode_v3(inode_k); ++ ++ new_inode = bch2_trans_kmalloc(trans, bkey_bytes(inode_k.k)); ++ ret = PTR_ERR_OR_ZERO(new_inode); ++ if (unlikely(ret)) ++ goto err; ++ ++ bkey_reassemble(&new_inode->k_i, inode.s_c); ++ ++ if (!(le64_to_cpu(inode.v->bi_flags) & BCH_INODE_I_SIZE_DIRTY) && ++ new_i_size > le64_to_cpu(inode.v->bi_size)) ++ new_inode->v.bi_size = cpu_to_le64(new_i_size); ++ ++ le64_add_cpu(&new_inode->v.bi_sectors, i_sectors_delta); ++ ++ new_inode->k.p.snapshot = iter->snapshot; ++ ++ /* ++ * Note: ++ * We always have to do an inode updated - even when i_size/i_sectors ++ * aren't changing - for fsync to work properly; fsync relies on ++ * inode->bi_journal_seq which is updated by the trigger code: ++ */ ++ ret = bch2_trans_update(trans, &inode_iter, &new_inode->k_i, 0) ?: ++ bch2_trans_update(trans, iter, k, 0) ?: + bch2_trans_commit(trans, disk_res, journal_seq, + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_NOFAIL); -+ bch2_trans_iter_exit(trans, &inode_iter); -+ -+ if (ret) -+ return ret; ++ if (unlikely(ret)) ++ goto err; + + if (i_sectors_delta_total) + *i_sectors_delta_total += i_sectors_delta; + bch2_btree_iter_set_pos(iter, next_pos); -+ -+ return 0; ++err: ++ bch2_trans_iter_exit(trans, &inode_iter); ++ return ret; +} + +/* @@ -55862,11 +57111,10 @@ index 000000000000..e047ef28f127 + return ret; +} + -+int bch2_write_index_default(struct bch_write_op *op) ++static int bch2_write_index_default(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct bkey_buf sk; -+ struct open_bucket *ec_ob = ec_open_bucket(c, &op->open_buckets); + struct keylist *keys = 
&op->insert_keys; + struct bkey_i *k = bch2_keylist_front(keys); + struct btree_trans trans; @@ -55900,7 +57148,7 @@ index 000000000000..e047ef28f127 + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + + ret = bch2_extent_update(&trans, inum, &iter, sk.k, -+ &op->res, op_journal_seq(op), ++ &op->res, &op->journal_seq, + op->new_i_size, &op->i_sectors_delta, + op->flags & BCH_WRITE_CHECK_ENOSPC); + bch2_trans_iter_exit(&trans, &iter); @@ -55910,9 +57158,6 @@ index 000000000000..e047ef28f127 + if (ret) + break; + -+ if (ec_ob) -+ bch2_ob_add_backpointer(c, ec_ob, &sk.k->k); -+ + if (bkey_cmp(iter.pos, k->k.p) >= 0) + bch2_keylist_pop_front(&op->insert_keys); + else @@ -55981,9 +57226,9 @@ index 000000000000..e047ef28f127 + } +} + -+static void __bch2_write(struct closure *); ++static void __bch2_write(struct bch_write_op *); + -+static void bch2_write_done(struct closure *cl) ++static void __bch2_write_done(struct closure *cl) +{ + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct bch_fs *c = op->c; @@ -55997,12 +57242,23 @@ index 000000000000..e047ef28f127 + + bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); + -+ if (op->end_io) { -+ EBUG_ON(cl->parent); -+ closure_debug_destroy(cl); ++ closure_debug_destroy(cl); ++ if (op->end_io) + op->end_io(op); ++} ++ ++static __always_inline void bch2_write_done(struct bch_write_op *op) ++{ ++ if (likely(!(op->flags & BCH_WRITE_FLUSH) || op->error)) { ++ __bch2_write_done(&op->cl); ++ } else if (!(op->flags & BCH_WRITE_SYNC)) { ++ bch2_journal_flush_seq_async(&op->c->journal, ++ op->journal_seq, ++ &op->cl); ++ continue_at(&op->cl, __bch2_write_done, index_update_wq(op)); + } else { -+ closure_return(cl); ++ bch2_journal_flush_seq(&op->c->journal, op->journal_seq); ++ __bch2_write_done(&op->cl); + } +} + @@ -56041,7 +57297,7 @@ index 000000000000..e047ef28f127 + struct keylist *keys = &op->insert_keys; + struct bkey_i *k; + unsigned dev; -+ int ret; ++ int ret = 0; + + if 
(unlikely(op->flags & BCH_WRITE_IO_ERROR)) { + ret = bch2_write_drop_io_error_ptrs(op); @@ -56064,7 +57320,10 @@ index 000000000000..e047ef28f127 + + if (!bch2_keylist_empty(keys)) { + u64 sectors_start = keylist_sectors(keys); -+ int ret = op->index_update_fn(op); ++ ++ ret = !(op->flags & BCH_WRITE_MOVE) ++ ? bch2_write_index_default(op) ++ : bch2_data_update_index_update(op); + + BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); + BUG_ON(keylist_sectors(keys) && !ret); @@ -56074,7 +57333,7 @@ index 000000000000..e047ef28f127 + if (ret) { + bch_err_inum_ratelimited(c, op->pos.inode, + "write error while doing btree update: %s", bch2_err_str(ret)); -+ op->error = ret; ++ goto err; + } + } +out: @@ -56087,25 +57346,45 @@ index 000000000000..e047ef28f127 +err: + keys->top = keys->keys; + op->error = ret; ++ op->flags |= BCH_WRITE_DONE; + goto out; +} + +static void bch2_write_index(struct closure *cl) +{ + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); -+ struct bch_fs *c = op->c; ++ struct write_point *wp = op->wp; ++ struct workqueue_struct *wq = index_update_wq(op); + -+ __bch2_write_index(op); ++ barrier(); ++ op->btree_update_ready = true; ++ queue_work(wq, &wp->index_update_work); ++} + -+ if (!(op->flags & BCH_WRITE_DONE)) { -+ continue_at(cl, __bch2_write, index_update_wq(op)); -+ } else if (!op->error && (op->flags & BCH_WRITE_FLUSH)) { -+ bch2_journal_flush_seq_async(&c->journal, -+ *op_journal_seq(op), -+ cl); -+ continue_at(cl, bch2_write_done, index_update_wq(op)); -+ } else { -+ continue_at_nobarrier(cl, bch2_write_done, NULL); ++void bch2_write_point_do_index_updates(struct work_struct *work) ++{ ++ struct write_point *wp = ++ container_of(work, struct write_point, index_update_work); ++ struct bch_write_op *op; ++ ++ while (1) { ++ spin_lock(&wp->writes_lock); ++ op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list); ++ if (op && !op->btree_update_ready) ++ op = NULL; ++ if (op) ++ 
list_del(&op->wp_list); ++ spin_unlock(&wp->writes_lock); ++ ++ if (!op) ++ break; ++ ++ __bch2_write_index(op); ++ ++ if (!(op->flags & BCH_WRITE_DONE)) ++ __bch2_write(op); ++ else ++ bch2_write_done(op); + } +} + @@ -56138,12 +57417,12 @@ index 000000000000..e047ef28f127 + if (wbio->put_bio) + bio_put(bio); + -+ if (parent) ++ if (parent) { + bio_endio(&parent->bio); -+ else if (!(op->flags & BCH_WRITE_SKIP_CLOSURE_PUT)) -+ closure_put(cl); -+ else -+ continue_at_nobarrier(cl, bch2_write_index, index_update_wq(op)); ++ return; ++ } ++ ++ closure_put(cl); +} + +static void init_append_extent(struct bch_write_op *op, @@ -56401,8 +57680,7 @@ index 000000000000..e047ef28f127 + saved_iter = dst->bi_iter; + + do { -+ struct bch_extent_crc_unpacked crc = -+ (struct bch_extent_crc_unpacked) { 0 }; ++ struct bch_extent_crc_unpacked crc = { 0 }; + struct bversion version = op->version; + size_t dst_len, src_len; + @@ -56454,6 +57732,8 @@ index 000000000000..e047ef28f127 + !crc_is_compressed(crc) && + bch2_csum_type_is_encryption(op->crc.csum_type) == + bch2_csum_type_is_encryption(op->csum_type)) { ++ u8 compression_type = crc.compression_type; ++ u16 nonce = crc.nonce; + /* + * Note: when we're using rechecksum(), we need to be + * checksumming @src because it has all the data our @@ -56472,6 +57752,13 @@ index 000000000000..e047ef28f127 + bio_sectors(src) - (src_len >> 9), + op->csum_type)) + goto csum_err; ++ /* ++ * rchecksum_bio sets compression_type on crc from op->crc, ++ * this isn't always correct as sometimes we're changing ++ * an extent from uncompressed to incompressible. 
++ */ ++ crc.compression_type = compression_type; ++ crc.nonce = nonce; + } else { + if ((op->flags & BCH_WRITE_DATA_ENCODED) && + bch2_rechecksum_bio(c, src, version, op->crc, @@ -56542,19 +57829,18 @@ index 000000000000..e047ef28f127 + return ret; +} + -+static void __bch2_write(struct closure *cl) ++static void __bch2_write(struct bch_write_op *op) +{ -+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct bch_fs *c = op->c; -+ struct write_point *wp; ++ struct write_point *wp = NULL; + struct bio *bio = NULL; -+ bool skip_put = true; + unsigned nofs_flags; + int ret; + + nofs_flags = memalloc_nofs_save(); +again: + memset(&op->failed, 0, sizeof(op->failed)); ++ op->btree_update_ready = false; + + do { + struct bkey_i *key_to_write; @@ -56564,76 +57850,60 @@ index 000000000000..e047ef28f127 + /* +1 for possible cache device: */ + if (op->open_buckets.nr + op->nr_replicas + 1 > + ARRAY_SIZE(op->open_buckets.v)) -+ goto flush_io; ++ break; + + if (bch2_keylist_realloc(&op->insert_keys, + op->inline_keys, + ARRAY_SIZE(op->inline_keys), + BKEY_EXTENT_U64s_MAX)) -+ goto flush_io; ++ break; + + /* + * The copygc thread is now global, which means it's no longer + * freeing up space on specific disks, which means that + * allocations for specific disks may hang arbitrarily long: + */ -+ wp = bch2_alloc_sectors_start(c, -+ op->target, -+ op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED), -+ op->write_point, -+ &op->devs_have, -+ op->nr_replicas, -+ op->nr_replicas_required, -+ op->alloc_reserve, -+ op->flags, -+ (op->flags & (BCH_WRITE_ALLOC_NOWAIT| -+ BCH_WRITE_ONLY_SPECIFIED_DEVS)) ? 
NULL : cl); -+ EBUG_ON(!wp); -+ -+ if (unlikely(IS_ERR(wp))) { -+ if (unlikely(PTR_ERR(wp) != -EAGAIN)) { -+ ret = PTR_ERR(wp); -+ goto err; ++ ret = bch2_trans_do(c, NULL, NULL, 0, ++ bch2_alloc_sectors_start_trans(&trans, ++ op->target, ++ op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED), ++ op->write_point, ++ &op->devs_have, ++ op->nr_replicas, ++ op->nr_replicas_required, ++ op->alloc_reserve, ++ op->flags, ++ (op->flags & (BCH_WRITE_ALLOC_NOWAIT| ++ BCH_WRITE_ONLY_SPECIFIED_DEVS)) ++ ? NULL : &op->cl, &wp)); ++ if (unlikely(ret)) { ++ if (unlikely(ret != -EAGAIN)) { ++ op->error = ret; ++ op->flags |= BCH_WRITE_DONE; + } + -+ goto flush_io; ++ break; + } + -+ /* -+ * It's possible for the allocator to fail, put us on the -+ * freelist waitlist, and then succeed in one of various retry -+ * paths: if that happens, we need to disable the skip_put -+ * optimization because otherwise there won't necessarily be a -+ * barrier before we free the bch_write_op: -+ */ -+ if (atomic_read(&cl->remaining) & CLOSURE_WAITING) -+ skip_put = false; -+ + bch2_open_bucket_get(c, wp, &op->open_buckets); + ret = bch2_write_extent(op, wp, &bio); ++ + bch2_alloc_sectors_done(c, wp); + -+ if (ret < 0) -+ goto err; -+ -+ if (ret) { -+ skip_put = false; -+ } else { -+ /* -+ * for the skip_put optimization this has to be set -+ * before we submit the bio: -+ */ ++ if (ret < 0) { ++ op->error = ret; + op->flags |= BCH_WRITE_DONE; ++ break; + } + ++ if (!ret) ++ op->flags |= BCH_WRITE_DONE; ++ + bio->bi_end_io = bch2_write_endio; + bio->bi_private = &op->cl; + bio->bi_opf |= REQ_OP_WRITE; + -+ if (!skip_put) -+ closure_get(bio->bi_private); -+ else -+ op->flags |= BCH_WRITE_SKIP_CLOSURE_PUT; ++ closure_get(bio->bi_private); + + key_to_write = (void *) (op->insert_keys.keys_p + + key_to_write_offset); @@ -56642,48 +57912,34 @@ index 000000000000..e047ef28f127 + key_to_write); + } while (ret); + -+ if (!skip_put) -+ continue_at(cl, bch2_write_index, index_update_wq(op)); -+out: -+ 
memalloc_nofs_restore(nofs_flags); -+ return; -+err: -+ op->error = ret; -+ op->flags |= BCH_WRITE_DONE; -+ -+ continue_at(cl, bch2_write_index, index_update_wq(op)); -+ goto out; -+flush_io: + /* -+ * If the write can't all be submitted at once, we generally want to -+ * block synchronously as that signals backpressure to the caller. ++ * Sync or no? + * -+ * However, if we're running out of a workqueue, we can't block here -+ * because we'll be blocking other work items from completing: ++ * If we're running asynchronously, wne may still want to block ++ * synchronously here if we weren't able to submit all of the IO at ++ * once, as that signals backpressure to the caller. + */ -+ if (current->flags & PF_WQ_WORKER) { -+ continue_at(cl, bch2_write_index, index_update_wq(op)); -+ goto out; -+ } -+ -+ closure_sync(cl); -+ -+ if (!bch2_keylist_empty(&op->insert_keys)) { ++ if ((op->flags & BCH_WRITE_SYNC) || !(op->flags & BCH_WRITE_DONE)) { ++ closure_sync(&op->cl); + __bch2_write_index(op); + -+ if (op->error) { -+ op->flags |= BCH_WRITE_DONE; -+ continue_at_nobarrier(cl, bch2_write_done, NULL); -+ goto out; -+ } ++ if (!(op->flags & BCH_WRITE_DONE)) ++ goto again; ++ bch2_write_done(op); ++ } else { ++ spin_lock(&wp->writes_lock); ++ op->wp = wp; ++ list_add_tail(&op->wp_list, &wp->writes); ++ spin_unlock(&wp->writes_lock); ++ ++ continue_at(&op->cl, bch2_write_index, NULL); + } + -+ goto again; ++ memalloc_nofs_restore(nofs_flags); +} + +static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) +{ -+ struct closure *cl = &op->cl; + struct bio *bio = &op->wbio.bio; + struct bvec_iter iter; + struct bkey_i_inline_data *id; @@ -56720,10 +57976,9 @@ index 000000000000..e047ef28f127 + op->flags |= BCH_WRITE_WROTE_DATA_INLINE; + op->flags |= BCH_WRITE_DONE; + -+ continue_at_nobarrier(cl, bch2_write_index, NULL); -+ return; ++ __bch2_write_index(op); +err: -+ bch2_write_done(&op->cl); ++ bch2_write_done(op); +} + +/** @@ -56749,6 +58004,7 @@ index 
000000000000..e047ef28f127 + struct bch_fs *c = op->c; + unsigned data_len; + ++ EBUG_ON(op->cl.parent); + BUG_ON(!op->nr_replicas); + BUG_ON(!op->write_point.v); + BUG_ON(!bkey_cmp(op->pos, POS_MAX)); @@ -56782,24 +58038,19 @@ index 000000000000..e047ef28f127 + return; + } + -+ continue_at_nobarrier(cl, __bch2_write, NULL); ++ __bch2_write(op); + return; +err: + bch2_disk_reservation_put(c, &op->res); + -+ if (op->end_io) { -+ EBUG_ON(cl->parent); -+ closure_debug_destroy(cl); ++ closure_debug_destroy(&op->cl); ++ if (op->end_io) + op->end_io(op); -+ } else { -+ closure_return(cl); -+ } +} + +/* Cache promotion on read */ + +struct promote_op { -+ struct closure cl; + struct rcu_head rcu; + u64 start_time; + @@ -56853,10 +58104,10 @@ index 000000000000..e047ef28f127 + kfree_rcu(op, rcu); +} + -+static void promote_done(struct closure *cl) ++static void promote_done(struct bch_write_op *wop) +{ + struct promote_op *op = -+ container_of(cl, struct promote_op, cl); ++ container_of(wop, struct promote_op, write.op); + struct bch_fs *c = op->write.op.c; + + bch2_time_stats_update(&c->times[BCH_TIME_data_promote], @@ -56868,7 +58119,6 @@ index 000000000000..e047ef28f127 + +static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) +{ -+ struct closure *cl = &op->cl; + struct bio *bio = &op->write.op.wbio.bio; + + trace_and_count(op->write.op.c, read_promote, &rbio->bio); @@ -56881,9 +58131,7 @@ index 000000000000..e047ef28f127 + sizeof(struct bio_vec) * rbio->bio.bi_vcnt); + swap(bio->bi_vcnt, rbio->bio.bi_vcnt); + -+ closure_init(cl, NULL); -+ bch2_data_update_read_done(&op->write, rbio->pick.crc, cl); -+ closure_return_with_destructor(cl, promote_done); ++ bch2_data_update_read_done(&op->write, rbio->pick.crc); +} + +static struct promote_op *__promote_alloc(struct bch_fs *c, @@ -56948,6 +58196,7 @@ index 000000000000..e047ef28f127 + }, + btree_id, k); + BUG_ON(ret); ++ op->write.op.end_io = promote_done; + + return op; +err: @@ -57907,10 +59156,10 @@ 
index 000000000000..e047ef28f127 +} diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h new file mode 100644 -index 000000000000..3ae31758a01e +index 000000000000..9e6862f474d8 --- /dev/null +++ b/fs/bcachefs/io.h -@@ -0,0 +1,190 @@ +@@ -0,0 +1,183 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_IO_H +#define _BCACHEFS_IO_H @@ -57948,20 +59197,14 @@ index 000000000000..3ae31758a01e + BCH_WRITE_WROTE_DATA_INLINE = (1 << 7), + BCH_WRITE_FROM_INTERNAL = (1 << 8), + BCH_WRITE_CHECK_ENOSPC = (1 << 9), ++ BCH_WRITE_SYNC = (1 << 10), ++ BCH_WRITE_MOVE = (1 << 11), + + /* Internal: */ -+ BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 10), -+ BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 11), + BCH_WRITE_DONE = (1 << 12), + BCH_WRITE_IO_ERROR = (1 << 13), +}; + -+static inline u64 *op_journal_seq(struct bch_write_op *op) -+{ -+ return (op->flags & BCH_WRITE_JOURNAL_SEQ_PTR) -+ ? op->journal_seq_p : &op->journal_seq; -+} -+ +static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) +{ + return op->alloc_reserve == RESERVE_movinggc @@ -57979,8 +59222,6 @@ index 000000000000..3ae31758a01e + subvol_inum, u64, s64 *); +int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *); + -+int bch2_write_index_default(struct bch_write_op *); -+ +static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, + struct bch_io_opts opts) +{ @@ -58007,11 +59248,12 @@ index 000000000000..3ae31758a01e + op->journal_seq = 0; + op->new_i_size = U64_MAX; + op->i_sectors_delta = 0; -+ op->index_update_fn = bch2_write_index_default; +} + +void bch2_write(struct closure *); + ++void bch2_write_point_do_index_updates(struct work_struct *); ++ +static inline struct bch_write_bio *wbio_init(struct bio *bio) +{ + struct bch_write_bio *wbio = to_wbio(bio); @@ -58103,10 +59345,10 @@ index 000000000000..3ae31758a01e +#endif /* _BCACHEFS_IO_H */ diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h new file mode 100644 -index 000000000000..78bff13d36f2 +index 
000000000000..ca65f2c52c1c --- /dev/null +++ b/fs/bcachefs/io_types.h -@@ -0,0 +1,161 @@ +@@ -0,0 +1,156 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_IO_TYPES_H +#define _BCACHEFS_IO_TYPES_H @@ -58226,6 +59468,7 @@ index 000000000000..78bff13d36f2 + unsigned nr_replicas_required:4; + unsigned alloc_reserve:3; + unsigned incompressible:1; ++ unsigned btree_update_ready:1; + + struct bch_devs_list devs_have; + u16 target; @@ -58241,23 +59484,17 @@ index 000000000000..78bff13d36f2 + + struct write_point_specifier write_point; + ++ struct write_point *wp; ++ struct list_head wp_list; ++ + struct disk_reservation res; + + struct open_buckets open_buckets; + -+ /* -+ * If caller wants to flush but hasn't passed us a journal_seq ptr, we -+ * still need to stash the journal_seq somewhere: -+ */ -+ union { -+ u64 *journal_seq_p; -+ u64 journal_seq; -+ }; ++ u64 journal_seq; + u64 new_i_size; + s64 i_sectors_delta; + -+ int (*index_update_fn)(struct bch_write_op *); -+ + struct bch_devs_mask failed; + + struct keylist insert_keys; @@ -58270,7 +59507,7 @@ index 000000000000..78bff13d36f2 +#endif /* _BCACHEFS_IO_TYPES_H */ diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c new file mode 100644 -index 000000000000..ab594623341f +index 000000000000..95c29229d3fe --- /dev/null +++ b/fs/bcachefs/journal.c @@ -0,0 +1,1436 @@ @@ -59015,7 +60252,7 @@ index 000000000000..ab594623341f + return ret; + + entry = container_of(journal_res_entry(j, &res), -+ struct jset_entry_log, entry);; ++ struct jset_entry_log, entry); + memset(entry, 0, u64s * sizeof(u64)); + entry->entry.type = BCH_JSET_ENTRY_log; + entry->entry.u64s = u64s - 1; @@ -59072,10 +60309,10 @@ index 000000000000..ab594623341f + bch2_journal_block(&c->journal); + } + -+ bu = kzalloc(nr_want * sizeof(*bu), GFP_KERNEL); -+ ob = kzalloc(nr_want * sizeof(*ob), GFP_KERNEL); -+ new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL); -+ new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL); ++ bu = 
kcalloc(nr_want, sizeof(*bu), GFP_KERNEL); ++ ob = kcalloc(nr_want, sizeof(*ob), GFP_KERNEL); ++ new_buckets = kcalloc(nr, sizeof(u64), GFP_KERNEL); ++ new_bucket_seq = kcalloc(nr, sizeof(u64), GFP_KERNEL); + if (!bu || !ob || !new_buckets || !new_bucket_seq) { + ret = -ENOMEM; + goto err_unblock; @@ -59541,7 +60778,7 @@ index 000000000000..ab594623341f + rcu_read_lock(); + s = READ_ONCE(j->reservations); + -+ prt_printf(out, "dirty journal entries:\t%llu/%llu\n",fifo_used(&j->pin), j->pin.size); ++ prt_printf(out, "dirty journal entries:\t%llu/%llu\n", fifo_used(&j->pin), j->pin.size); + prt_printf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j)); + prt_printf(out, "seq_ondisk:\t\t%llu\n", j->seq_ondisk); + prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j)); @@ -59712,10 +60949,10 @@ index 000000000000..ab594623341f +} diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h new file mode 100644 -index 000000000000..d3caa7ea7ce9 +index 000000000000..51d29a01b7b2 --- /dev/null +++ b/fs/bcachefs/journal.h -@@ -0,0 +1,521 @@ +@@ -0,0 +1,540 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_JOURNAL_H +#define _BCACHEFS_JOURNAL_H @@ -59747,8 +60984,8 @@ index 000000000000..d3caa7ea7ce9 + * + * Synchronous updates are specified by passing a closure (@flush_cl) to + * bch2_btree_insert() or bch_btree_insert_node(), which then pass that parameter -+ * down to the journalling code. That closure will will wait on the journal -+ * write to complete (via closure_wait()). ++ * down to the journalling code. That closure will wait on the journal write to ++ * complete (via closure_wait()). 
+ * + * If the index update wasn't synchronous, the journal entry will be + * written out after 10 ms have elapsed, by default (the delay_ms field @@ -59828,6 +61065,7 @@ index 000000000000..d3caa7ea7ce9 + */ + +#include ++#include + +#include "journal_types.h" + @@ -60022,15 +61260,26 @@ index 000000000000..d3caa7ea7ce9 +{ + union journal_res_state old, new; + u64 v = atomic64_read(&j->reservations.counter); ++ unsigned u64s, offset; + + do { + old.v = new.v = v; + + /* ++ * Round up the end of the journal reservation to the next ++ * cacheline boundary: ++ */ ++ u64s = res->u64s; ++ offset = sizeof(struct jset) / sizeof(u64) + ++ new.cur_entry_offset + u64s; ++ u64s += ((offset - 1) & ((SMP_CACHE_BYTES / sizeof(u64)) - 1)) + 1; ++ ++ ++ /* + * Check if there is still room in the current journal + * entry: + */ -+ if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s) ++ if (new.cur_entry_offset + u64s > j->cur_entry_u64s) + return 0; + + EBUG_ON(!journal_state_count(new, new.idx)); @@ -60038,7 +61287,7 @@ index 000000000000..d3caa7ea7ce9 + if ((flags & JOURNAL_WATERMARK_MASK) < j->watermark) + return 0; + -+ new.cur_entry_offset += res->u64s; ++ new.cur_entry_offset += u64s; + journal_state_inc(&new); + + /* @@ -60055,8 +61304,15 @@ index 000000000000..d3caa7ea7ce9 + + res->ref = true; + res->idx = old.idx; ++ res->u64s = u64s; + res->offset = old.cur_entry_offset; + res->seq = le64_to_cpu(j->buf[old.idx].data->seq); ++ ++ offset = res->offset; ++ while (offset < res->offset + res->u64s) { ++ prefetchw(vstruct_idx(j->buf[res->idx].data, offset)); ++ offset += SMP_CACHE_BYTES / sizeof(u64); ++ } + return 1; +} + @@ -60239,10 +61495,10 @@ index 000000000000..d3caa7ea7ce9 +#endif /* _BCACHEFS_JOURNAL_H */ diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c new file mode 100644 -index 000000000000..253a6ae20159 +index 000000000000..c4922c640653 --- /dev/null +++ b/fs/bcachefs/journal_io.c -@@ -0,0 +1,1759 @@ +@@ -0,0 +1,1807 @@ +// 
SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "alloc_background.h" @@ -60262,6 +61518,23 @@ index 000000000000..253a6ae20159 + +#include + ++static struct nonce journal_nonce(const struct jset *jset) ++{ ++ return (struct nonce) {{ ++ [0] = 0, ++ [1] = ((__le32 *) &jset->seq)[0], ++ [2] = ((__le32 *) &jset->seq)[1], ++ [3] = BCH_NONCE_JOURNAL, ++ }}; ++} ++ ++static bool jset_csum_good(struct bch_fs *c, struct jset *j) ++{ ++ return bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j)) && ++ !bch2_crc_cmp(j->csum, ++ csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j)); ++} ++ +static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq) +{ + return (seq - c->journal_entries_base_seq) & (~0U >> 1); @@ -60304,8 +61577,7 @@ index 000000000000..253a6ae20159 + */ +static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, + struct journal_ptr entry_ptr, -+ struct journal_list *jlist, struct jset *j, -+ bool bad) ++ struct journal_list *jlist, struct jset *j) +{ + struct genradix_iter iter; + struct journal_replay **_i, *i, *dup; @@ -60356,38 +61628,53 @@ index 000000000000..253a6ae20159 + */ + dup = *_i; + if (dup) { -+ if (dup->bad) { -+ /* we'll replace @dup: */ -+ } else if (bad) { -+ i = dup; -+ goto found; -+ } else { -+ fsck_err_on(bytes != vstruct_bytes(&dup->j) || -+ memcmp(j, &dup->j, bytes), c, -+ "found duplicate but non identical journal entries (seq %llu)", -+ le64_to_cpu(j->seq)); ++ if (bytes == vstruct_bytes(&dup->j) && ++ !memcmp(j, &dup->j, bytes)) { + i = dup; + goto found; + } -+ } + ++ if (!entry_ptr.csum_good) { ++ i = dup; ++ goto found; ++ } ++ ++ if (!dup->csum_good) ++ goto replace; ++ ++ fsck_err(c, "found duplicate but non identical journal entries (seq %llu)", ++ le64_to_cpu(j->seq)); ++ i = dup; ++ goto found; ++ } ++replace: + i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); + if (!i) + return -ENOMEM; + -+ i->nr_ptrs = 0; -+ i->bad = bad; ++ i->nr_ptrs = 0; ++ i->csum_good = 
entry_ptr.csum_good; + i->ignore = false; + memcpy(&i->j, j, bytes); ++ i->ptrs[i->nr_ptrs++] = entry_ptr; + + if (dup) { -+ i->nr_ptrs = dup->nr_ptrs; -+ memcpy(i->ptrs, dup->ptrs, sizeof(dup->ptrs)); ++ if (dup->nr_ptrs >= ARRAY_SIZE(dup->ptrs)) { ++ bch_err(c, "found too many copies of journal entry %llu", ++ le64_to_cpu(i->j.seq)); ++ dup->nr_ptrs = ARRAY_SIZE(dup->ptrs) - 1; ++ } ++ ++ /* The first ptr should represent the jset we kept: */ ++ memcpy(i->ptrs + i->nr_ptrs, ++ dup->ptrs, ++ sizeof(dup->ptrs[0]) * dup->nr_ptrs); ++ i->nr_ptrs += dup->nr_ptrs; + __journal_replay_free(c, dup); + } + -+ + *_i = i; ++ return 0; +found: + for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) { + if (ptr->dev == ca->dev_idx) { @@ -60409,16 +61696,6 @@ index 000000000000..253a6ae20159 + return ret; +} + -+static struct nonce journal_nonce(const struct jset *jset) -+{ -+ return (struct nonce) {{ -+ [0] = 0, -+ [1] = ((__le32 *) &jset->seq)[0], -+ [2] = ((__le32 *) &jset->seq)[1], -+ [3] = BCH_NONCE_JOURNAL, -+ }}; -+} -+ +/* this fills in a range with empty jset_entries: */ +static void journal_entry_null_range(void *start, void *end) +{ @@ -60960,12 +62237,8 @@ index 000000000000..253a6ae20159 +static int jset_validate(struct bch_fs *c, + struct bch_dev *ca, + struct jset *jset, u64 sector, -+ unsigned bucket_sectors_left, -+ unsigned sectors_read, + int write) +{ -+ size_t bytes = vstruct_bytes(jset); -+ struct bch_csum csum; + unsigned version; + int ret = 0; + @@ -60982,21 +62255,7 @@ index 000000000000..253a6ae20159 + sector, le64_to_cpu(jset->seq), + version)) { + /* don't try to continue: */ -+ return EINVAL; -+ } -+ -+ if (bytes > (sectors_read << 9) && -+ sectors_read < bucket_sectors_left) -+ return JOURNAL_ENTRY_REREAD; -+ -+ if (journal_entry_err_on(bytes > bucket_sectors_left << 9, -+ c, jset, NULL, -+ "%s sector %llu seq %llu: journal entry too big (%zu bytes)", -+ ca ? 
ca->name : c->name, -+ sector, le64_to_cpu(jset->seq), bytes)) { -+ ret = JOURNAL_ENTRY_BAD; -+ le32_add_cpu(&jset->u64s, -+ -((bytes - (bucket_sectors_left << 9)) / 8)); ++ return -EINVAL; + } + + if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), @@ -61004,28 +62263,9 @@ index 000000000000..253a6ae20159 + "%s sector %llu seq %llu: journal entry with unknown csum type %llu", + ca ? ca->name : c->name, + sector, le64_to_cpu(jset->seq), -+ JSET_CSUM_TYPE(jset))) { -+ ret = JOURNAL_ENTRY_BAD; -+ goto csum_done; -+ } -+ -+ if (write) -+ goto csum_done; -+ -+ csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset); -+ if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), -+ c, jset, NULL, -+ "%s sector %llu seq %llu: journal checksum bad", -+ ca ? ca->name : c->name, -+ sector, le64_to_cpu(jset->seq))) ++ JSET_CSUM_TYPE(jset))) + ret = JOURNAL_ENTRY_BAD; + -+ ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), -+ jset->encrypted_start, -+ vstruct_end(jset) - (void *) jset->encrypted_start); -+ bch2_fs_fatal_err_on(ret, c, -+ "error decrypting journal entry: %i", ret); -+csum_done: + /* last_seq is ignored when JSET_NO_FLUSH is true */ + if (journal_entry_err_on(!JSET_NO_FLUSH(jset) && + le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), @@ -61036,16 +62276,52 @@ index 000000000000..253a6ae20159 + jset->last_seq = jset->seq; + return JOURNAL_ENTRY_BAD; + } ++ ++ ret = jset_validate_entries(c, jset, write); +fsck_err: + return ret; +} + -+static int jset_validate_for_write(struct bch_fs *c, struct jset *jset) ++static int jset_validate_early(struct bch_fs *c, ++ struct bch_dev *ca, ++ struct jset *jset, u64 sector, ++ unsigned bucket_sectors_left, ++ unsigned sectors_read) +{ -+ unsigned sectors = vstruct_sectors(jset, c->block_bits); ++ size_t bytes = vstruct_bytes(jset); ++ unsigned version; ++ int write = READ; ++ int ret = 0; + -+ return jset_validate(c, NULL, jset, 0, sectors, sectors, WRITE) ?: -+ 
jset_validate_entries(c, jset, WRITE); ++ if (le64_to_cpu(jset->magic) != jset_magic(c)) ++ return JOURNAL_ENTRY_NONE; ++ ++ version = le32_to_cpu(jset->version); ++ if (journal_entry_err_on((version != BCH_JSET_VERSION_OLD && ++ version < bcachefs_metadata_version_min) || ++ version >= bcachefs_metadata_version_max, ++ c, jset, NULL, ++ "%s sector %llu seq %llu: unknown journal entry version %u", ++ ca ? ca->name : c->name, ++ sector, le64_to_cpu(jset->seq), ++ version)) { ++ /* don't try to continue: */ ++ return -EINVAL; ++ } ++ ++ if (bytes > (sectors_read << 9) && ++ sectors_read < bucket_sectors_left) ++ return JOURNAL_ENTRY_REREAD; ++ ++ if (journal_entry_err_on(bytes > bucket_sectors_left << 9, ++ c, jset, NULL, ++ "%s sector %llu seq %llu: journal entry too big (%zu bytes)", ++ ca ? ca->name : c->name, ++ sector, le64_to_cpu(jset->seq), bytes)) ++ le32_add_cpu(&jset->u64s, ++ -((bytes - (bucket_sectors_left << 9)) / 8)); ++fsck_err: ++ return ret; +} + +struct journal_read_buf { @@ -61084,7 +62360,7 @@ index 000000000000..253a6ae20159 + unsigned sectors, sectors_read = 0; + u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), + end = offset + ca->mi.bucket_size; -+ bool saw_bad = false; ++ bool saw_bad = false, csum_good; + int ret = 0; + + pr_debug("reading %u", bucket); @@ -61123,9 +62399,8 @@ index 000000000000..253a6ae20159 + j = buf->data; + } + -+ ret = jset_validate(c, ca, j, offset, -+ end - offset, sectors_read, -+ READ); ++ ret = jset_validate_early(c, ca, j, offset, ++ end - offset, sectors_read); + switch (ret) { + case 0: + sectors = vstruct_sectors(j, c->block_bits); @@ -61141,17 +62416,13 @@ index 000000000000..253a6ae20159 + case JOURNAL_ENTRY_NONE: + if (!saw_bad) + return 0; -+ sectors = block_sectors(c); -+ goto next_block; -+ case JOURNAL_ENTRY_BAD: -+ saw_bad = true; + /* + * On checksum error we don't really trust the size + * field of the journal entry we read, so try reading + * again at next block boundary: + */ + sectors = 
block_sectors(c); -+ break; ++ goto next_block; + default: + return ret; + } @@ -61167,14 +62438,25 @@ index 000000000000..253a6ae20159 + + ja->bucket_seq[bucket] = le64_to_cpu(j->seq); + ++ csum_good = jset_csum_good(c, j); ++ if (!csum_good) ++ saw_bad = true; ++ ++ ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), ++ j->encrypted_start, ++ vstruct_end(j) - (void *) j->encrypted_start); ++ bch2_fs_fatal_err_on(ret, c, ++ "error decrypting journal entry: %i", ret); ++ + mutex_lock(&jlist->lock); + ret = journal_entry_add(c, ca, (struct journal_ptr) { ++ .csum_good = csum_good, + .dev = ca->dev_idx, + .bucket = bucket, + .bucket_offset = offset - + bucket_to_sector(ca, ja->buckets[bucket]), + .sector = offset, -+ }, jlist, j, ret != 0); ++ }, jlist, j); + mutex_unlock(&jlist->lock); + + switch (ret) { @@ -61373,6 +62655,14 @@ index 000000000000..253a6ae20159 + *start_seq = le64_to_cpu(i->j.seq) + 1; + + if (!JSET_NO_FLUSH(&i->j)) { ++ int write = READ; ++ if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq), ++ c, &i->j, NULL, ++ "invalid journal entry: last_seq > seq (%llu > %llu)", ++ le64_to_cpu(i->j.last_seq), ++ le64_to_cpu(i->j.seq))) ++ i->j.last_seq = i->j.seq; ++ + last_seq = le64_to_cpu(i->j.last_seq); + *blacklist_seq = le64_to_cpu(i->j.seq) + 1; + break; @@ -61476,7 +62766,21 @@ index 000000000000..253a6ae20159 + if (!i || i->ignore) + continue; + -+ ret = jset_validate_entries(c, &i->j, READ); ++ for (ptr = 0; ptr < i->nr_ptrs; ptr++) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev); ++ ++ if (!i->ptrs[ptr].csum_good) ++ printk(KERN_ERR "bcachefs (%s) sector %llu: invalid journal checksum, seq %llu%s\n", ++ ca->name, i->ptrs[ptr].sector, ++ le64_to_cpu(i->j.seq), ++ i->csum_good ? 
" (had good copy on another device)" : ""); ++ } ++ ++ ret = jset_validate(c, ++ bch_dev_bkey_exists(c, i->ptrs[0].dev), ++ &i->j, ++ i->ptrs[0].sector, ++ READ); + if (ret) + goto err; + @@ -61912,7 +63216,7 @@ index 000000000000..253a6ae20159 + validate_before_checksum = true; + + if (validate_before_checksum && -+ jset_validate_for_write(c, jset)) ++ jset_validate(c, NULL, jset, 0, WRITE)) + goto err; + + ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), @@ -61926,7 +63230,7 @@ index 000000000000..253a6ae20159 + journal_nonce(jset), jset); + + if (!validate_before_checksum && -+ jset_validate_for_write(c, jset)) ++ jset_validate(c, NULL, jset, 0, WRITE)) + goto err; + + sectors = vstruct_sectors(jset, c->block_bits); @@ -62004,7 +63308,7 @@ index 000000000000..253a6ae20159 +} diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h new file mode 100644 -index 000000000000..1a91f2c0a26c +index 000000000000..2f8bbf06b289 --- /dev/null +++ b/fs/bcachefs/journal_io.h @@ -0,0 +1,59 @@ @@ -62018,6 +63322,7 @@ index 000000000000..1a91f2c0a26c + */ +struct journal_replay { + struct journal_ptr { ++ bool csum_good; + u8 dev; + u32 bucket; + u32 bucket_offset; @@ -62025,8 +63330,7 @@ index 000000000000..1a91f2c0a26c + } ptrs[BCH_REPLICAS_MAX]; + unsigned nr_ptrs; + -+ /* checksum error, but we may want to try using it anyways: */ -+ bool bad; ++ bool csum_good; + bool ignore; + /* must be last: */ + struct jset j; @@ -62069,7 +63373,7 @@ index 000000000000..1a91f2c0a26c +#endif /* _BCACHEFS_JOURNAL_IO_H */ diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c new file mode 100644 -index 000000000000..e69595bd1359 +index 000000000000..e873ce2a3f03 --- /dev/null +++ b/fs/bcachefs/journal_reclaim.c @@ -0,0 +1,853 @@ @@ -62307,7 +63611,7 @@ index 000000000000..e69595bd1359 + if ((j->space[journal_space_clean_ondisk].next_entry < + j->space[journal_space_clean_ondisk].total) && + (clean - clean_ondisk <= total / 8) && -+ 
(clean_ondisk * 2 > clean )) ++ (clean_ondisk * 2 > clean)) + set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); + else + clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); @@ -62438,7 +63742,7 @@ index 000000000000..e69595bd1359 + list_del_init(&pin->list); + + /* -+ * Unpinning a journal entry make make journal_next_bucket() succeed, if ++ * Unpinning a journal entry may make journal_next_bucket() succeed if + * writing a new last_seq will now make another bucket available: + */ + if (atomic_dec_and_test(&pin_list->count) && @@ -63020,7 +64324,7 @@ index 000000000000..0fd1af120db5 +#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */ diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c new file mode 100644 -index 000000000000..cfdbd92d2164 +index 000000000000..c19db0425dd7 --- /dev/null +++ b/fs/bcachefs/journal_sb.c @@ -0,0 +1,220 @@ @@ -63057,7 +64361,7 @@ index 000000000000..cfdbd92d2164 + if (!nr) + return 0; + -+ b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL); ++ b = kmalloc_array(nr, sizeof(u64), GFP_KERNEL); + if (!b) + return -ENOMEM; + @@ -63140,7 +64444,7 @@ index 000000000000..cfdbd92d2164 + if (!nr) + return 0; + -+ b = kmalloc_array(sizeof(*b), nr, GFP_KERNEL); ++ b = kmalloc_array(nr, sizeof(*b), GFP_KERNEL); + if (!b) + return -ENOMEM; + @@ -63978,13 +65282,14 @@ index 000000000000..a6cdb885ad41 +#endif /* _BCACHEFS_JOURNAL_TYPES_H */ diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c new file mode 100644 -index 000000000000..cda77835b9ea +index 000000000000..5e85055b0f93 --- /dev/null +++ b/fs/bcachefs/keylist.c -@@ -0,0 +1,67 @@ +@@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" ++#include "bkey.h" +#include "keylist.h" + +int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s, @@ -64051,10 +65356,10 @@ index 000000000000..cda77835b9ea +#endif diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h new file mode 100644 -index 000000000000..195799bb20bc +index 000000000000..635efb7e8228 --- /dev/null +++ 
b/fs/bcachefs/keylist.h -@@ -0,0 +1,76 @@ +@@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_KEYLIST_H +#define _BCACHEFS_KEYLIST_H @@ -64074,7 +65379,6 @@ index 000000000000..195799bb20bc +{ + if (l->keys_p != inline_keys) + kfree(l->keys_p); -+ bch2_keylist_init(l, inline_keys); +} + +static inline void bch2_keylist_push(struct keylist *l) @@ -64367,7 +65671,7 @@ index 000000000000..53e607d72274 +} diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h new file mode 100644 -index 000000000000..3decb7b1dde2 +index 000000000000..925c29b49b86 --- /dev/null +++ b/fs/bcachefs/lru.h @@ -0,0 +1,19 @@ @@ -64378,10 +65682,10 @@ index 000000000000..3decb7b1dde2 +int bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + -+#define bch2_bkey_ops_lru (struct bkey_ops) { \ ++#define bch2_bkey_ops_lru ((struct bkey_ops) { \ + .key_invalid = bch2_lru_invalid, \ + .val_to_text = bch2_lru_to_text, \ -+} ++}) + +int bch2_lru_delete(struct btree_trans *, u64, u64, u64, struct bkey_s_c); +int bch2_lru_set(struct btree_trans *, u64, u64, u64 *); @@ -64597,10 +65901,10 @@ index 000000000000..027efaa0d575 +#endif /* _BCACHEFS_MIGRATE_H */ diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c new file mode 100644 -index 000000000000..e85c3143051c +index 000000000000..5c3e378a8698 --- /dev/null +++ b/fs/bcachefs/move.c -@@ -0,0 +1,954 @@ +@@ -0,0 +1,1011 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -64656,9 +65960,8 @@ index 000000000000..e85c3143051c + struct bio_vec bi_inline_vecs[0]; +}; + -+static void move_free(struct closure *cl) ++static void move_free(struct moving_io *io) +{ -+ struct moving_io *io = container_of(cl, struct moving_io, cl); + struct moving_context *ctxt = io->write.ctxt; + struct bch_fs *c = ctxt->c; + @@ -64668,31 +65971,30 @@ index 000000000000..e85c3143051c + kfree(io); +} + -+static void move_write_done(struct 
closure *cl) ++static void move_write_done(struct bch_write_op *op) +{ -+ struct moving_io *io = container_of(cl, struct moving_io, cl); ++ struct moving_io *io = container_of(op, struct moving_io, write.op); + struct moving_context *ctxt = io->write.ctxt; + + if (io->write.op.error) + ctxt->write_error = true; + + atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors); -+ closure_return_with_destructor(cl, move_free); ++ move_free(io); ++ closure_put(&ctxt->cl); +} + -+static void move_write(struct closure *cl) ++static void move_write(struct moving_io *io) +{ -+ struct moving_io *io = container_of(cl, struct moving_io, cl); -+ + if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) { -+ closure_return_with_destructor(cl, move_free); ++ move_free(io); + return; + } + ++ closure_get(&io->write.ctxt->cl); + atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); + -+ bch2_data_update_read_done(&io->write, io->rbio.pick.crc, cl); -+ continue_at(cl, move_write_done, NULL); ++ bch2_data_update_read_done(&io->write, io->rbio.pick.crc); +} + +static inline struct moving_io *next_pending_write(struct moving_context *ctxt) @@ -64724,7 +66026,7 @@ index 000000000000..e85c3143051c + + while ((io = next_pending_write(ctxt))) { + list_del(&io->list); -+ closure_call(&io->cl, move_write, NULL, &ctxt->cl); ++ move_write(io); + } +} + @@ -64794,7 +66096,52 @@ index 000000000000..e85c3143051c + scnprintf(stats->name, sizeof(stats->name), "%s", name); +} + ++static int bch2_extent_drop_ptrs(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k, ++ struct data_update_opts data_opts) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_i *n; ++ int ret; ++ ++ n = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ ret = PTR_ERR_OR_ZERO(n); ++ if (ret) ++ return ret; ++ ++ bkey_reassemble(n, k); ++ ++ while (data_opts.kill_ptrs) { ++ unsigned i = 0, drop = __fls(data_opts.kill_ptrs); ++ struct bch_extent_ptr *ptr; ++ ++ bch2_bkey_drop_ptrs(bkey_i_to_s(n), 
ptr, i++ == drop); ++ data_opts.kill_ptrs ^= 1U << drop; ++ } ++ ++ /* ++ * If the new extent no longer has any pointers, bch2_extent_normalize() ++ * will do the appropriate thing with it (turning it into a ++ * KEY_TYPE_error key, or just a discard if it was a cached extent) ++ */ ++ bch2_extent_normalize(c, bkey_i_to_s(n)); ++ ++ /* ++ * Since we're not inserting through an extent iterator ++ * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators), ++ * we aren't using the extent overwrite path to delete, we're ++ * just using the normal key deletion path: ++ */ ++ if (bkey_deleted(&n->k)) ++ n->k.size = 0; ++ ++ return bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: ++ bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); ++} ++ +static int bch2_move_extent(struct btree_trans *trans, ++ struct btree_iter *iter, + struct moving_context *ctxt, + struct bch_io_opts io_opts, + enum btree_id btree_id, @@ -64809,6 +66156,15 @@ index 000000000000..e85c3143051c + unsigned sectors = k.k->size, pages; + int ret = -ENOMEM; + ++ bch2_data_update_opts_normalize(k, &data_opts); ++ ++ if (!data_opts.rewrite_ptrs && ++ !data_opts.extra_replicas) { ++ if (data_opts.kill_ptrs) ++ return bch2_extent_drop_ptrs(trans, iter, k, data_opts); ++ return 0; ++ } ++ + if (!percpu_ref_tryget_live(&c->writes)) + return -EROFS; + @@ -64851,6 +66207,7 @@ index 000000000000..e85c3143051c + goto err_free_pages; + + io->write.ctxt = ctxt; ++ io->write.op.end_io = move_write_done; + + atomic64_inc(&ctxt->stats->keys_moved); + atomic64_add(k.k->size, &ctxt->stats->sectors_moved); @@ -65046,11 +66403,11 @@ index 000000000000..e85c3143051c + /* + * The iterator gets unlocked by __bch2_read_extent - need to + * save a copy of @k elsewhere: -+ */ ++ */ + bch2_bkey_buf_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); + -+ ret2 = bch2_move_extent(&trans, ctxt, io_opts, ++ ret2 = bch2_move_extent(&trans, &iter, ctxt, io_opts, + btree_id, k, data_opts); + if (ret2) { 
+ if (bch2_err_matches(ret2, BCH_ERR_transaction_restart)) @@ -65147,7 +66504,7 @@ index 000000000000..e85c3143051c + prt_str(&buf, "failed to evacuate bucket "); + bch2_bkey_val_to_text(&buf, c, k); + -+ bch2_trans_inconsistent(trans, "%s", buf.buf); ++ bch_err(c, "%s", buf.buf); + printbuf_exit(&buf); + } + } @@ -65177,7 +66534,8 @@ index 000000000000..e85c3143051c + bch2_trans_begin(&trans); + + ret = bch2_get_next_backpointer(&trans, bucket, gen, -+ &bp_offset, &bp); ++ &bp_offset, &bp, ++ BTREE_ITER_CACHED); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) @@ -65202,11 +66560,12 @@ index 000000000000..e85c3143051c + + bch2_bkey_buf_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); -+ bch2_trans_iter_exit(&trans, &iter); + + ret = move_get_io_opts(&trans, &io_opts, k, &cur_inum); -+ if (ret) ++ if (ret) { ++ bch2_trans_iter_exit(&trans, &iter); + continue; ++ } + + data_opts = _data_opts; + data_opts.target = io_opts.background_target; @@ -65218,8 +66577,10 @@ index 000000000000..e85c3143051c + i++; + } + -+ ret = bch2_move_extent(&trans, ctxt, io_opts, ++ ret = bch2_move_extent(&trans, &iter, ctxt, io_opts, + bp.btree_id, k, data_opts); ++ bch2_trans_iter_exit(&trans, &iter); ++ + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret == -ENOMEM) { @@ -65413,7 +66774,7 @@ index 000000000000..e85c3143051c + i++; + } + -+ return data_opts->rewrite_ptrs != 0;; ++ return data_opts->rewrite_ptrs != 0; +} + +static bool rereplicate_btree_pred(struct bch_fs *c, void *arg, @@ -65655,7 +67016,7 @@ index 000000000000..9df6d18137a5 +#endif /* _BCACHEFS_MOVE_TYPES_H */ diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c new file mode 100644 -index 000000000000..35958c6bb4a6 +index 000000000000..044eca879afc --- /dev/null +++ b/fs/bcachefs/movinggc.c @@ -0,0 +1,285 @@ @@ -65823,7 +67184,7 @@ index 000000000000..35958c6bb4a6 + + bch2_moving_ctxt_exit(&ctxt); + -+ if (ret < 0) ++ if (ret < 0 && ret != -EROFS) + 
bch_err(c, "error from bch2_move_data() in copygc: %s", bch2_err_str(ret)); + + trace_and_count(c, copygc, c, atomic64_read(&move_stats.sectors_moved), 0, 0, 0); @@ -67061,10 +68422,10 @@ index 000000000000..5b8586ecb374 +#endif /* _BCACHEFS_OPTS_H */ diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c new file mode 100644 -index 000000000000..c12d715fb758 +index 000000000000..db8172736527 --- /dev/null +++ b/fs/bcachefs/quota.c -@@ -0,0 +1,823 @@ +@@ -0,0 +1,978 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "btree_update.h" @@ -67162,6 +68523,113 @@ index 000000000000..c12d715fb758 +#include +#include + ++static void qc_info_to_text(struct printbuf *out, struct qc_info *i) ++{ ++ printbuf_tabstops_reset(out); ++ printbuf_tabstop_push(out, 20); ++ ++ prt_str(out, "i_fieldmask"); ++ prt_tab(out); ++ prt_printf(out, "%x", i->i_fieldmask); ++ prt_newline(out); ++ ++ prt_str(out, "i_flags"); ++ prt_tab(out); ++ prt_printf(out, "%u", i->i_flags); ++ prt_newline(out); ++ ++ prt_str(out, "i_spc_timelimit"); ++ prt_tab(out); ++ prt_printf(out, "%u", i->i_spc_timelimit); ++ prt_newline(out); ++ ++ prt_str(out, "i_ino_timelimit"); ++ prt_tab(out); ++ prt_printf(out, "%u", i->i_ino_timelimit); ++ prt_newline(out); ++ ++ prt_str(out, "i_rt_spc_timelimit"); ++ prt_tab(out); ++ prt_printf(out, "%u", i->i_rt_spc_timelimit); ++ prt_newline(out); ++ ++ prt_str(out, "i_spc_warnlimit"); ++ prt_tab(out); ++ prt_printf(out, "%u", i->i_spc_warnlimit); ++ prt_newline(out); ++ ++ prt_str(out, "i_ino_warnlimit"); ++ prt_tab(out); ++ prt_printf(out, "%u", i->i_ino_warnlimit); ++ prt_newline(out); ++ ++ prt_str(out, "i_rt_spc_warnlimit"); ++ prt_tab(out); ++ prt_printf(out, "%u", i->i_rt_spc_warnlimit); ++ prt_newline(out); ++} ++ ++static void qc_dqblk_to_text(struct printbuf *out, struct qc_dqblk *q) ++{ ++ printbuf_tabstops_reset(out); ++ printbuf_tabstop_push(out, 20); ++ ++ prt_str(out, "d_fieldmask"); ++ prt_tab(out); ++ prt_printf(out, "%x", 
q->d_fieldmask); ++ prt_newline(out); ++ ++ prt_str(out, "d_spc_hardlimit"); ++ prt_tab(out); ++ prt_printf(out, "%llu", q->d_spc_hardlimit); ++ prt_newline(out); ++ ++ prt_str(out, "d_spc_softlimit"); ++ prt_tab(out); ++ prt_printf(out, "%llu", q->d_spc_softlimit); ++ prt_newline(out); ++ ++ prt_str(out, "d_ino_hardlimit"); ++ prt_tab(out); ++ prt_printf(out, "%llu", q->d_ino_hardlimit); ++ prt_newline(out); ++ ++ prt_str(out, "d_ino_softlimit"); ++ prt_tab(out); ++ prt_printf(out, "%llu", q->d_ino_softlimit); ++ prt_newline(out); ++ ++ prt_str(out, "d_space"); ++ prt_tab(out); ++ prt_printf(out, "%llu", q->d_space); ++ prt_newline(out); ++ ++ prt_str(out, "d_ino_count"); ++ prt_tab(out); ++ prt_printf(out, "%llu", q->d_ino_count); ++ prt_newline(out); ++ ++ prt_str(out, "d_ino_timer"); ++ prt_tab(out); ++ prt_printf(out, "%llu", q->d_ino_timer); ++ prt_newline(out); ++ ++ prt_str(out, "d_spc_timer"); ++ prt_tab(out); ++ prt_printf(out, "%llu", q->d_spc_timer); ++ prt_newline(out); ++ ++ prt_str(out, "d_ino_warns"); ++ prt_tab(out); ++ prt_printf(out, "%i", q->d_ino_warns); ++ prt_newline(out); ++ ++ prt_str(out, "d_spc_warns"); ++ prt_tab(out); ++ prt_printf(out, "%i", q->d_spc_warns); ++ prt_newline(out); ++} ++ +static inline unsigned __next_qtype(unsigned i, unsigned qtypes) +{ + qtypes >>= i; @@ -67292,34 +68760,20 @@ index 000000000000..c12d715fb758 + if (qc->hardlimit && + qc->hardlimit < n && + !ignore_hardlimit(q)) { -+ if (mode == KEY_TYPE_QUOTA_PREALLOC) -+ return -EDQUOT; -+ + prepare_warning(qc, qtype, counter, msgs, HARDWARN); ++ return -EDQUOT; + } + + if (qc->softlimit && -+ qc->softlimit < n && -+ qc->timer && -+ ktime_get_real_seconds() >= qc->timer && -+ !ignore_hardlimit(q)) { -+ if (mode == KEY_TYPE_QUOTA_PREALLOC) ++ qc->softlimit < n) { ++ if (qc->timer == 0) { ++ qc->timer = ktime_get_real_seconds() + q->limits[counter].timelimit; ++ prepare_warning(qc, qtype, counter, msgs, SOFTWARN); ++ } else if (ktime_get_real_seconds() >= qc->timer && 
++ !ignore_hardlimit(q)) { ++ prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN); + return -EDQUOT; -+ -+ prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN); -+ } -+ -+ if (qc->softlimit && -+ qc->softlimit < n && -+ qc->timer == 0) { -+ if (mode == KEY_TYPE_QUOTA_PREALLOC) -+ return -EDQUOT; -+ -+ prepare_warning(qc, qtype, counter, msgs, SOFTWARN); -+ -+ /* XXX is this the right one? */ -+ qc->timer = ktime_get_real_seconds() + -+ q->limits[counter].warnlimit; ++ } + } + + return 0; @@ -67429,7 +68883,8 @@ index 000000000000..c12d715fb758 + return ret; +} + -+static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k) ++static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k, ++ struct qc_dqblk *qdq) +{ + struct bkey_s_c_quota dq; + struct bch_memquota_type *q; @@ -67458,6 +68913,15 @@ index 000000000000..c12d715fb758 + mq->c[i].softlimit = le64_to_cpu(dq.v->c[i].softlimit); + } + ++ if (qdq && qdq->d_fieldmask & QC_SPC_TIMER) ++ mq->c[Q_SPC].timer = cpu_to_le64(qdq->d_spc_timer); ++ if (qdq && qdq->d_fieldmask & QC_SPC_WARNS) ++ mq->c[Q_SPC].warns = cpu_to_le64(qdq->d_spc_warns); ++ if (qdq && qdq->d_fieldmask & QC_INO_TIMER) ++ mq->c[Q_INO].timer = cpu_to_le64(qdq->d_ino_timer); ++ if (qdq && qdq->d_fieldmask & QC_INO_WARNS) ++ mq->c[Q_INO].warns = cpu_to_le64(qdq->d_ino_warns); ++ + mutex_unlock(&q->lock); + } + @@ -67480,6 +68944,26 @@ index 000000000000..c12d715fb758 + mutex_init(&c->quotas[i].lock); +} + ++static struct bch_sb_field_quota *bch2_sb_get_or_create_quota(struct bch_sb_handle *sb) ++{ ++ struct bch_sb_field_quota *sb_quota = bch2_sb_get_quota(sb->sb); ++ ++ if (sb_quota) ++ return sb_quota; ++ ++ sb_quota = bch2_sb_resize_quota(sb, sizeof(*sb_quota) / sizeof(u64)); ++ if (sb_quota) { ++ unsigned qtype, qc; ++ ++ for (qtype = 0; qtype < QTYP_NR; qtype++) ++ for (qc = 0; qc < Q_COUNTERS; qc++) ++ sb_quota->q[qtype].c[qc].timelimit = ++ cpu_to_le32(7 * 24 * 60 * 60); ++ } ++ ++ return sb_quota; ++} ++ +static void 
bch2_sb_quota_read(struct bch_fs *c) +{ + struct bch_sb_field_quota *sb_quota; @@ -67538,12 +69022,19 @@ index 000000000000..c12d715fb758 + +int bch2_fs_quota_read(struct bch_fs *c) +{ ++ struct bch_sb_field_quota *sb_quota; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + mutex_lock(&c->sb_lock); ++ sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); ++ if (!sb_quota) { ++ mutex_unlock(&c->sb_lock); ++ return -BCH_ERR_ENOSPC_sb_quota; ++ } ++ + bch2_sb_quota_read(c); + mutex_unlock(&c->sb_lock); + @@ -67551,7 +69042,7 @@ index 000000000000..c12d715fb758 + + ret = for_each_btree_key2(&trans, iter, BTREE_ID_quotas, + POS_MIN, BTREE_ITER_PREFETCH, k, -+ __bch2_quota_set(c, k)) ?: ++ __bch2_quota_set(c, k, NULL)) ?: + for_each_btree_key2(&trans, iter, BTREE_ID_inodes, + POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + bch2_fs_quota_read_inode(&trans, &iter, k)); @@ -67567,6 +69058,8 @@ index 000000000000..c12d715fb758 +static int bch2_quota_enable(struct super_block *sb, unsigned uflags) +{ + struct bch_fs *c = sb->s_fs_info; ++ struct bch_sb_field_quota *sb_quota; ++ int ret = 0; + + if (sb->s_flags & SB_RDONLY) + return -EROFS; @@ -67586,6 +69079,12 @@ index 000000000000..c12d715fb758 + return -EINVAL; + + mutex_lock(&c->sb_lock); ++ sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); ++ if (!sb_quota) { ++ ret = -BCH_ERR_ENOSPC_sb_quota; ++ goto unlock; ++ } ++ + if (uflags & FS_QUOTA_UDQ_ENFD) + SET_BCH_SB_USRQUOTA(c->disk_sb.sb, true); + @@ -67596,9 +69095,10 @@ index 000000000000..c12d715fb758 + SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true); + + bch2_write_super(c); ++unlock: + mutex_unlock(&c->sb_lock); + -+ return 0; ++ return bch2_err_class(ret); +} + +static int bch2_quota_disable(struct super_block *sb, unsigned uflags) @@ -67710,6 +69210,15 @@ index 000000000000..c12d715fb758 + struct bch_fs *c = sb->s_fs_info; + struct bch_sb_field_quota *sb_quota; + struct bch_memquota_type *q; ++ int ret = 0; ++ ++ if (0) 
{ ++ struct printbuf buf = PRINTBUF; ++ ++ qc_info_to_text(&buf, info); ++ pr_info("setting:\n%s", buf.buf); ++ printbuf_exit(&buf); ++ } + + if (sb->s_flags & SB_RDONLY) + return -EROFS; @@ -67727,12 +69236,10 @@ index 000000000000..c12d715fb758 + q = &c->quotas[type]; + + mutex_lock(&c->sb_lock); -+ sb_quota = bch2_sb_get_quota(c->disk_sb.sb); ++ sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); + if (!sb_quota) { -+ sb_quota = bch2_sb_resize_quota(&c->disk_sb, -+ sizeof(*sb_quota) / sizeof(u64)); -+ if (!sb_quota) -+ return -BCH_ERR_ENOSPC_sb_quota; ++ ret = -BCH_ERR_ENOSPC_sb_quota; ++ goto unlock; + } + + if (info->i_fieldmask & QC_SPC_TIMER) @@ -67754,9 +69261,10 @@ index 000000000000..c12d715fb758 + bch2_sb_quota_read(c); + + bch2_write_super(c); ++unlock: + mutex_unlock(&c->sb_lock); + -+ return 0; ++ return bch2_err_class(ret); +} + +/* Get/set individual quotas: */ @@ -67861,6 +69369,14 @@ index 000000000000..c12d715fb758 + struct bkey_i_quota new_quota; + int ret; + ++ if (0) { ++ struct printbuf buf = PRINTBUF; ++ ++ qc_dqblk_to_text(&buf, qdq); ++ pr_info("setting:\n%s", buf.buf); ++ printbuf_exit(&buf); ++ } ++ + if (sb->s_flags & SB_RDONLY) + return -EROFS; + @@ -67869,7 +69385,7 @@ index 000000000000..c12d715fb758 + + ret = bch2_trans_do(c, NULL, NULL, 0, + bch2_set_quota_trans(&trans, &new_quota, qdq)) ?: -+ __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i)); ++ __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i), qdq); + + return ret; +} @@ -67890,7 +69406,7 @@ index 000000000000..c12d715fb758 +#endif /* CONFIG_BCACHEFS_QUOTA */ diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h new file mode 100644 -index 000000000000..8c67ae1da7c7 +index 000000000000..59bed1148201 --- /dev/null +++ b/fs/bcachefs/quota.h @@ -0,0 +1,71 @@ @@ -67906,10 +69422,10 @@ index 000000000000..8c67ae1da7c7 +int bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct 
bkey_s_c); + -+#define bch2_bkey_ops_quota (struct bkey_ops) { \ ++#define bch2_bkey_ops_quota ((struct bkey_ops) { \ + .key_invalid = bch2_quota_invalid, \ + .val_to_text = bch2_quota_to_text, \ -+} ++}) + +static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u) +{ @@ -68450,10 +69966,10 @@ index 000000000000..7462a92e9598 +#endif /* _BCACHEFS_REBALANCE_TYPES_H */ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c new file mode 100644 -index 000000000000..18f6ec5cc7d0 +index 000000000000..2bb078749b9a --- /dev/null +++ b/fs/bcachefs/recovery.c -@@ -0,0 +1,1587 @@ +@@ -0,0 +1,1606 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -68681,7 +70197,7 @@ index 000000000000..18f6ec5cc7d0 + .size = max_t(size_t, keys->size, 8) * 2, + }; + -+ new_keys.d = kvmalloc(sizeof(new_keys.d[0]) * new_keys.size, GFP_KERNEL); ++ new_keys.d = kvmalloc_array(new_keys.size, sizeof(new_keys.d[0]), GFP_KERNEL); + if (!new_keys.d) { + bch_err(c, "%s: error allocating new key array (size %zu)", + __func__, new_keys.size); @@ -68958,7 +70474,7 @@ index 000000000000..18f6ec5cc7d0 + + keys->size = roundup_pow_of_two(nr_keys); + -+ keys->d = kvmalloc(sizeof(keys->d[0]) * keys->size, GFP_KERNEL); ++ keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); + if (!keys->d) + return -ENOMEM; + @@ -69548,6 +71064,9 @@ index 000000000000..18f6ec5cc7d0 + c->opts.version_upgrade = true; + c->opts.fsck = true; + c->opts.fix_errors = FSCK_OPT_YES; ++ } else if (c->sb.version < bcachefs_metadata_version_inode_v3) { ++ bch_info(c, "version prior to inode_v3, upgrade required"); ++ c->opts.version_upgrade = true; + } + } + @@ -69704,6 +71223,20 @@ index 000000000000..18f6ec5cc7d0 + goto err; + bch_verbose(c, "done checking need_discard and freespace btrees"); + ++ if (c->sb.version < bcachefs_metadata_version_snapshot_2) { ++ err = "error creating root snapshot node"; ++ ret = bch2_fs_initialize_subvolumes(c); ++ if (ret) ++ goto err; ++ } ++ ++ 
bch_verbose(c, "reading snapshots table"); ++ err = "error reading snapshots table"; ++ ret = bch2_fs_snapshots_start(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "reading snapshots done"); ++ + set_bit(BCH_FS_MAY_GO_RW, &c->flags); + + bch_info(c, "starting journal replay, %zu keys", c->journal_keys.nr); @@ -69752,7 +71285,6 @@ index 000000000000..18f6ec5cc7d0 + bch_verbose(c, "done checking alloc to lru refs"); + set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags); + } else { -+ set_bit(BCH_FS_MAY_GO_RW, &c->flags); + set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); + set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags); + set_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags); @@ -69762,6 +71294,22 @@ index 000000000000..18f6ec5cc7d0 + if (c->opts.norecovery) + goto out; + ++ if (c->sb.version < bcachefs_metadata_version_snapshot_2) { ++ err = "error creating root snapshot node"; ++ ret = bch2_fs_initialize_subvolumes(c); ++ if (ret) ++ goto err; ++ } ++ ++ bch_verbose(c, "reading snapshots table"); ++ err = "error reading snapshots table"; ++ ret = bch2_fs_snapshots_start(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "reading snapshots done"); ++ ++ set_bit(BCH_FS_MAY_GO_RW, &c->flags); ++ + bch_verbose(c, "starting journal replay, %zu keys", c->journal_keys.nr); + err = "journal replay failed"; + ret = bch2_journal_replay(c); @@ -69777,22 +71325,6 @@ index 000000000000..18f6ec5cc7d0 + goto err; + + if (c->sb.version < bcachefs_metadata_version_snapshot_2) { -+ bch2_fs_lazy_rw(c); -+ -+ err = "error creating root snapshot node"; -+ ret = bch2_fs_initialize_subvolumes(c); -+ if (ret) -+ goto err; -+ } -+ -+ bch_verbose(c, "reading snapshots table"); -+ err = "error reading snapshots table"; -+ ret = bch2_fs_snapshots_start(c); -+ if (ret) -+ goto err; -+ bch_verbose(c, "reading snapshots done"); -+ -+ if (c->sb.version < bcachefs_metadata_version_snapshot_2) { + /* set bi_subvol on root inode */ + err = "error upgrade root inode for subvolumes"; + ret = bch2_trans_do(c, 
NULL, NULL, BTREE_INSERT_LAZY_RW, @@ -69876,7 +71408,8 @@ index 000000000000..18f6ec5cc7d0 + set_bit(BCH_FS_FSCK_DONE, &c->flags); + bch2_flush_fsck_errs(c); + -+ if (!c->opts.keep_journal) { ++ if (!c->opts.keep_journal && ++ test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) { + bch2_journal_keys_free(&c->journal_keys); + bch2_journal_entries_free(c); + } @@ -69914,7 +71447,7 @@ index 000000000000..18f6ec5cc7d0 + c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); + c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done); + -+ if (c->sb.version < bcachefs_metadata_version_backpointers) ++ if (c->sb.version < bcachefs_metadata_version_inode_v3) + c->opts.version_upgrade = true; + + if (c->opts.version_upgrade) { @@ -69925,6 +71458,9 @@ index 000000000000..18f6ec5cc7d0 + mutex_unlock(&c->sb_lock); + + set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); ++ set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags); ++ set_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags); ++ set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags); + set_bit(BCH_FS_MAY_GO_RW, &c->flags); + set_bit(BCH_FS_FSCK_DONE, &c->flags); + @@ -69989,11 +71525,10 @@ index 000000000000..18f6ec5cc7d0 + goto err; + bch_verbose(c, "reading snapshots done"); + -+ bch2_inode_init(c, &root_inode, 0, 0, -+ S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); ++ bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, 0, NULL); + root_inode.bi_inum = BCACHEFS_ROOT_INO; + root_inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; -+ bch2_inode_pack(c, &packed_inode, &root_inode); ++ bch2_inode_pack(&packed_inode, &root_inode); + packed_inode.inode.k.p.snapshot = U32_MAX; + + err = "error creating root directory"; @@ -70535,7 +72070,7 @@ index 000000000000..d5c14bb2992d +} diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h new file mode 100644 -index 000000000000..f9848dc3eebb +index 000000000000..ce0012aa99c6 --- /dev/null +++ b/fs/bcachefs/reflink.h @@ -0,0 +1,76 @@ @@ -70549,13 
+72084,13 @@ index 000000000000..f9848dc3eebb + struct bkey_s_c); +bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); + -+#define bch2_bkey_ops_reflink_p (struct bkey_ops) { \ ++#define bch2_bkey_ops_reflink_p ((struct bkey_ops) { \ + .key_invalid = bch2_reflink_p_invalid, \ + .val_to_text = bch2_reflink_p_to_text, \ + .key_merge = bch2_reflink_p_merge, \ + .trans_trigger = bch2_trans_mark_reflink_p, \ + .atomic_trigger = bch2_mark_reflink_p, \ -+} ++}) + +int bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c, + int, struct printbuf *); @@ -70564,13 +72099,13 @@ index 000000000000..f9848dc3eebb +int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_i *, unsigned); + -+#define bch2_bkey_ops_reflink_v (struct bkey_ops) { \ ++#define bch2_bkey_ops_reflink_v ((struct bkey_ops) { \ + .key_invalid = bch2_reflink_v_invalid, \ + .val_to_text = bch2_reflink_v_to_text, \ + .swab = bch2_ptr_swab, \ + .trans_trigger = bch2_trans_mark_reflink_v, \ + .atomic_trigger = bch2_mark_extent, \ -+} ++}) + +int bch2_indirect_inline_data_invalid(const struct bch_fs *, struct bkey_s_c, + int, struct printbuf *); @@ -70581,11 +72116,11 @@ index 000000000000..f9848dc3eebb + struct bkey_s_c, struct bkey_i *, + unsigned); + -+#define bch2_bkey_ops_indirect_inline_data (struct bkey_ops) { \ ++#define bch2_bkey_ops_indirect_inline_data ((struct bkey_ops) { \ + .key_invalid = bch2_indirect_inline_data_invalid, \ + .val_to_text = bch2_indirect_inline_data_to_text, \ + .trans_trigger = bch2_trans_mark_indirect_inline_data, \ -+} ++}) + +static inline const __le64 *bkey_refcount_c(struct bkey_s_c k) +{ @@ -71694,14 +73229,15 @@ index 000000000000..fcf73d723035 +} diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h new file mode 100644 -index 000000000000..87820b2e1ad3 +index 000000000000..cc34b3809206 --- /dev/null +++ b/fs/bcachefs/replicas.h -@@ -0,0 +1,106 @@ +@@ -0,0 +1,107 @@ +/* 
SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_REPLICAS_H +#define _BCACHEFS_REPLICAS_H + ++#include "bkey.h" +#include "eytzinger.h" +#include "replicas_types.h" + @@ -71806,10 +73342,11 @@ index 000000000000..87820b2e1ad3 +#endif /* _BCACHEFS_REPLICAS_H */ diff --git a/fs/bcachefs/replicas_types.h b/fs/bcachefs/replicas_types.h new file mode 100644 -index 000000000000..0535b1d3760e +index 000000000000..f12a35b3dbcf --- /dev/null +++ b/fs/bcachefs/replicas_types.h -@@ -0,0 +1,10 @@ +@@ -0,0 +1,11 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_REPLICAS_TYPES_H +#define _BCACHEFS_REPLICAS_TYPES_H + @@ -71822,7 +73359,7 @@ index 000000000000..0535b1d3760e +#endif /* _BCACHEFS_REPLICAS_TYPES_H */ diff --git a/fs/bcachefs/siphash.c b/fs/bcachefs/siphash.c new file mode 100644 -index 000000000000..c062edb3fbc2 +index 000000000000..dc1a27cc31cd --- /dev/null +++ b/fs/bcachefs/siphash.c @@ -0,0 +1,173 @@ @@ -71988,7 +73525,7 @@ index 000000000000..c062edb3fbc2 + + r = (ctx->v[0] ^ ctx->v[1]) ^ (ctx->v[2] ^ ctx->v[3]); + memset(ctx, 0, sizeof(*ctx)); -+ return (r); ++ return r; +} + +u64 SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len) @@ -72470,10 +74007,10 @@ index 000000000000..6178ae620ff1 +#endif /* _BCACHEFS_STR_HASH_H */ diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c new file mode 100644 -index 000000000000..8c98bacca290 +index 000000000000..1133783477e1 --- /dev/null +++ b/fs/bcachefs/subvolume.c -@@ -0,0 +1,1110 @@ +@@ -0,0 +1,1111 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -72634,6 +74171,7 @@ index 000000000000..8c98bacca290 + + for (i = 0; i < 2; i++) { + int ret = snapshot_live(trans, child[i]); ++ + if (ret < 0) + return ret; + @@ -73586,7 +75124,7 @@ index 000000000000..8c98bacca290 +} diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h new file mode 100644 -index 000000000000..02a636644988 +index 000000000000..c694c1c24483 --- /dev/null +++ 
b/fs/bcachefs/subvolume.h @@ -0,0 +1,137 @@ @@ -73601,10 +75139,10 @@ index 000000000000..02a636644988 +int bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c, + int rw, struct printbuf *); + -+#define bch2_bkey_ops_snapshot (struct bkey_ops) { \ ++#define bch2_bkey_ops_snapshot ((struct bkey_ops) { \ + .key_invalid = bch2_snapshot_invalid, \ + .val_to_text = bch2_snapshot_to_text, \ -+} ++}) + +int bch2_mark_snapshot(struct btree_trans *, struct bkey_s_c, + struct bkey_s_c, unsigned); @@ -73701,10 +75239,10 @@ index 000000000000..02a636644988 + int rw, struct printbuf *); +void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + -+#define bch2_bkey_ops_subvolume (struct bkey_ops) { \ ++#define bch2_bkey_ops_subvolume ((struct bkey_ops) { \ + .key_invalid = bch2_subvolume_invalid, \ + .val_to_text = bch2_subvolume_to_text, \ -+} ++}) + +int bch2_subvolume_get(struct btree_trans *, unsigned, + bool, int, struct bch_subvolume *); @@ -73744,10 +75282,10 @@ index 000000000000..f7562b5d51df +#endif /* _BCACHEFS_SUBVOLUME_TYPES_H */ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c new file mode 100644 -index 000000000000..cbc5979a5181 +index 000000000000..60c1f03c05af --- /dev/null +++ b/fs/bcachefs/super-io.c -@@ -0,0 +1,1603 @@ +@@ -0,0 +1,1601 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -73850,8 +75388,7 @@ index 000000000000..cbc5979a5181 + +void bch2_free_super(struct bch_sb_handle *sb) +{ -+ if (sb->bio) -+ kfree(sb->bio); ++ kfree(sb->bio); + if (!IS_ERR_OR_NULL(sb->bdev)) + blkdev_put(sb->bdev, sb->mode); + @@ -73899,8 +75436,7 @@ index 000000000000..cbc5979a5181 + + bio_init(bio, NULL, bio->bi_inline_vecs, nr_bvecs, 0); + -+ if (sb->bio) -+ kfree(sb->bio); ++ kfree(sb->bio); + sb->bio = bio; + } + @@ -75485,10 +77021,10 @@ index 000000000000..14a25f6fe29a +#endif /* _BCACHEFS_SUPER_IO_H */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c new file mode 100644 -index 
000000000000..a824e16079d5 +index 000000000000..5be4c40afa47 --- /dev/null +++ b/fs/bcachefs/super.c -@@ -0,0 +1,1964 @@ +@@ -0,0 +1,1961 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * bcachefs setup/teardown code, and some metadata io - read a superblock and @@ -75818,26 +77354,12 @@ index 000000000000..a824e16079d5 +{ + int ret; + -+ ret = bch2_gc_thread_start(c); -+ if (ret) { -+ bch_err(c, "error starting gc thread"); -+ return ret; -+ } -+ -+ ret = bch2_copygc_start(c); -+ if (ret) { -+ bch_err(c, "error starting copygc thread"); -+ return ret; -+ } -+ + ret = bch2_rebalance_start(c); + if (ret) { + bch_err(c, "error starting rebalance thread"); + return ret; + } + -+ schedule_work(&c->ec_stripe_delete_work); -+ + return 0; +} + @@ -75876,6 +77398,20 @@ index 000000000000..a824e16079d5 + bch2_dev_allocator_add(c, ca); + bch2_recalc_capacity(c); + ++ ret = bch2_gc_thread_start(c); ++ if (ret) { ++ bch_err(c, "error starting gc thread"); ++ return ret; ++ } ++ ++ ret = bch2_copygc_start(c); ++ if (ret) { ++ bch_err(c, "error starting copygc thread"); ++ return ret; ++ } ++ ++ schedule_work(&c->ec_stripe_delete_work); ++ + bch2_do_discards(c); + bch2_do_invalidates(c); + @@ -75954,8 +77490,8 @@ index 000000000000..a824e16079d5 + kfree(c->unused_inode_hints); + free_heap(&c->copygc_heap); + -+ if (c->io_complete_wq ) -+ destroy_workqueue(c->io_complete_wq ); ++ if (c->io_complete_wq) ++ destroy_workqueue(c->io_complete_wq); + if (c->copygc_wq) + destroy_workqueue(c->copygc_wq); + if (c->btree_io_complete_wq) @@ -76202,7 +77738,7 @@ index 000000000000..a824e16079d5 + goto err; + + pr_uuid(&name, c->sb.user_uuid.b); -+ strlcpy(c->name, name.buf, sizeof(c->name)); ++ strscpy(c->name, name.buf, sizeof(c->name)); + printbuf_exit(&name); + + ret = name.allocation_failure ? 
-ENOMEM : 0; @@ -76386,6 +77922,12 @@ index 000000000000..a824e16079d5 + bch2_dev_allocator_add(c, ca); + bch2_recalc_capacity(c); + ++ for (i = 0; i < BCH_TRANSACTIONS_NR; i++) { ++ mutex_lock(&c->btree_transaction_stats[i].lock); ++ bch2_time_stats_init(&c->btree_transaction_stats[i].lock_hold_times); ++ mutex_unlock(&c->btree_transaction_stats[i].lock); ++ } ++ + ret = BCH_SB_INITIALIZED(c->disk_sb.sb) + ? bch2_fs_recovery(c) + : bch2_fs_initialize(c); @@ -76817,18 +78359,10 @@ index 000000000000..a824e16079d5 +static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) +{ + /* -+ * Device going read only means the copygc reserve get smaller, so we -+ * don't want that happening while copygc is in progress: -+ */ -+ bch2_copygc_stop(c); -+ -+ /* + * The allocator thread itself allocates btree nodes, so stop it first: + */ + bch2_dev_allocator_remove(c, ca); + bch2_dev_journal_stop(&c->journal, ca); -+ -+ bch2_copygc_start(c); +} + +static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) @@ -77277,9 +78811,8 @@ index 000000000000..a824e16079d5 + } + + ret = bch2_trans_mark_dev_sb(c, ca); -+ if (ret) { ++ if (ret) + goto err; -+ } + + mutex_lock(&c->sb_lock); + mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; @@ -77782,10 +79315,10 @@ index 000000000000..89419fc7930d +#endif /* _BCACHEFS_SUPER_TYPES_H */ diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c new file mode 100644 -index 000000000000..f1b0f001255a +index 000000000000..647d018b5ec9 --- /dev/null +++ b/fs/bcachefs/sysfs.c -@@ -0,0 +1,954 @@ +@@ -0,0 +1,963 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * bcache sysfs interfaces @@ -77878,9 +79411,9 @@ index 000000000000..f1b0f001255a + static struct attribute sysfs_##_name = \ + { .name = #_name, .mode = _mode } + -+#define write_attribute(n) __sysfs_attribute(n, S_IWUSR) -+#define read_attribute(n) __sysfs_attribute(n, S_IRUGO) -+#define rw_attribute(n) __sysfs_attribute(n, S_IRUGO|S_IWUSR) ++#define 
write_attribute(n) __sysfs_attribute(n, 0200) ++#define read_attribute(n) __sysfs_attribute(n, 0444) ++#define rw_attribute(n) __sysfs_attribute(n, 0644) + +#define sysfs_printf(file, fmt, ...) \ +do { \ @@ -77963,7 +79496,7 @@ index 000000000000..f1b0f001255a +read_attribute(bucket_size); +read_attribute(first_bucket); +read_attribute(nbuckets); -+read_attribute(durability); ++rw_attribute(durability); +read_attribute(iodone); + +read_attribute(io_latency_read); @@ -77972,7 +79505,7 @@ index 000000000000..f1b0f001255a +read_attribute(io_latency_stats_write); +read_attribute(congested); + -+read_attribute(btree_avg_write_size); ++read_attribute(btree_write_stats); + +read_attribute(btree_cache_size); +read_attribute(compression_stats); @@ -78016,13 +79549,13 @@ index 000000000000..f1b0f001255a + +#define x(_name) \ + static struct attribute sysfs_time_stat_##_name = \ -+ { .name = #_name, .mode = S_IRUGO }; ++ { .name = #_name, .mode = 0444 }; + BCH_TIME_STATS() +#undef x + +static struct attribute sysfs_state_rw = { + .name = "state", -+ .mode = S_IRUGO ++ .mode = 0444, +}; + +static size_t bch2_btree_cache_size(struct bch_fs *c) @@ -78038,14 +79571,6 @@ index 000000000000..f1b0f001255a + return ret; +} + -+static size_t bch2_btree_avg_write_size(struct bch_fs *c) -+{ -+ u64 nr = atomic64_read(&c->btree_writes_nr); -+ u64 sectors = atomic64_read(&c->btree_writes_sectors); -+ -+ return nr ? 
div64_u64(sectors, nr) : 0; -+} -+ +static long data_progress_to_text(struct printbuf *out, struct bch_fs *c) +{ + long ret = 0; @@ -78086,7 +79611,7 @@ index 000000000000..f1b0f001255a + bch2_trans_init(&trans, c, 0, 0); + + for (id = 0; id < BTREE_ID_NR; id++) { -+ if (!((1U << id) & BTREE_ID_HAS_PTRS)) ++ if (!btree_type_has_ptrs(id)) + continue; + + for_each_btree_key(&trans, iter, id, POS_MIN, @@ -78184,7 +79709,9 @@ index 000000000000..f1b0f001255a + sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b); + + sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c)); -+ sysfs_hprint(btree_avg_write_size, bch2_btree_avg_write_size(c)); ++ ++ if (attr == &sysfs_btree_write_stats) ++ bch2_btree_write_stats_to_text(out, c); + + sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic); + @@ -78213,7 +79740,7 @@ index 000000000000..f1b0f001255a + bch2_btree_updates_to_text(out, c); + + if (attr == &sysfs_btree_cache) -+ bch2_btree_cache_to_text(out, c); ++ bch2_btree_cache_to_text(out, &c->btree_cache); + + if (attr == &sysfs_btree_key_cache) + bch2_btree_key_cache_to_text(out, &c->btree_key_cache); @@ -78342,7 +79869,7 @@ index 000000000000..f1b0f001255a +struct attribute *bch2_fs_files[] = { + &sysfs_minor, + &sysfs_btree_cache_size, -+ &sysfs_btree_avg_write_size, ++ &sysfs_btree_write_stats, + + &sysfs_promote_whole_extents, + @@ -78401,12 +79928,14 @@ index 000000000000..f1b0f001255a +SHOW(bch2_fs_internal) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, internal); ++ + return bch2_fs_to_text(out, &c->kobj, attr); +} + +STORE(bch2_fs_internal) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, internal); ++ + return bch2_fs_store(&c->kobj, attr, buf, size); +} +SYSFS_OPS(bch2_fs_internal); @@ -78695,6 +80224,19 @@ index 000000000000..f1b0f001255a + mutex_unlock(&c->sb_lock); + } + ++ if (attr == &sysfs_durability) { ++ u64 v = strtoul_or_return(buf); ++ ++ mutex_lock(&c->sb_lock); ++ mi = 
&bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; ++ ++ if (v != BCH_MEMBER_DURABILITY(mi)) { ++ SET_BCH_MEMBER_DURABILITY(mi, v + 1); ++ bch2_write_super(c); ++ } ++ mutex_unlock(&c->sb_lock); ++ } ++ + if (attr == &sysfs_label) { + char *tmp; + int ret; @@ -78796,10 +80338,10 @@ index 000000000000..222cd5062702 +#endif /* _BCACHEFS_SYSFS_H_ */ diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c new file mode 100644 -index 000000000000..d05886181118 +index 000000000000..43f974eb9b7e --- /dev/null +++ b/fs/bcachefs/tests.c -@@ -0,0 +1,976 @@ +@@ -0,0 +1,973 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifdef CONFIG_BCACHEFS_TESTS + @@ -78848,7 +80390,7 @@ index 000000000000..d05886181118 + bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(&trans, &iter, &k.k_i, 0)); + if (ret) { -+ bch_err(c, "update error in test_delete: %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): update error in: %s", __func__, bch2_err_str(ret)); + goto err; + } + @@ -78857,7 +80399,7 @@ index 000000000000..d05886181118 + bch2_btree_iter_traverse(&iter) ?: + bch2_btree_delete_at(&trans, &iter, 0)); + if (ret) { -+ bch_err(c, "delete error (first) in test_delete: %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): delete error (first): %s", __func__, bch2_err_str(ret)); + goto err; + } + @@ -78866,7 +80408,7 @@ index 000000000000..d05886181118 + bch2_btree_iter_traverse(&iter) ?: + bch2_btree_delete_at(&trans, &iter, 0)); + if (ret) { -+ bch_err(c, "delete error (second) in test_delete: %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): delete error (second): %s", __func__, bch2_err_str(ret)); + goto err; + } +err: @@ -78894,7 +80436,7 @@ index 000000000000..d05886181118 + bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(&trans, &iter, &k.k_i, 0)); + if (ret) { -+ bch_err(c, "update error in test_delete_written: %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): update error: %s", __func__, bch2_err_str(ret)); + goto err; + } + @@ -78905,7 +80447,7 @@ index 000000000000..d05886181118 + 
bch2_btree_iter_traverse(&iter) ?: + bch2_btree_delete_at(&trans, &iter, 0)); + if (ret) { -+ bch_err(c, "delete error in test_delete_written: %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): delete error: %s", __func__, bch2_err_str(ret)); + goto err; + } +err: @@ -78938,7 +80480,7 @@ index 000000000000..d05886181118 + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, + NULL, NULL, 0); + if (ret) { -+ bch_err(c, "insert error in test_iterate: %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): insert error: %s", __func__, bch2_err_str(ret)); + goto err; + } + } @@ -79004,7 +80546,7 @@ index 000000000000..d05886181118 + ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, + NULL, NULL, 0); + if (ret) { -+ bch_err(c, "insert error in test_iterate_extents: %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): insert error: %s", __func__, bch2_err_str(ret)); + goto err; + } + } @@ -79071,7 +80613,7 @@ index 000000000000..d05886181118 + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, + NULL, NULL, 0); + if (ret) { -+ bch_err(c, "insert error in test_iterate_slots: %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): insert error: %s", __func__, bch2_err_str(ret)); + goto err; + } + } @@ -79144,7 +80686,7 @@ index 000000000000..d05886181118 + ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, + NULL, NULL, 0); + if (ret) { -+ bch_err(c, "insert error in test_iterate_slots_extents: %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): insert error: %s", __func__, bch2_err_str(ret)); + goto err; + } + } @@ -79258,7 +80800,7 @@ index 000000000000..d05886181118 + ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, + NULL, NULL, 0); + if (ret) -+ bch_err(c, "insert error in insert_test_extent: %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): insert error: %s", __func__, bch2_err_str(ret)); + return ret; +} + @@ -79357,7 +80899,7 @@ index 000000000000..d05886181118 + + ret = test_snapshot_filter(c, snapids[0], snapids[1]); + if (ret) { -+ bch_err(c, "err from test_snapshot_filter: %s", 
bch2_err_str(ret)); ++ bch_err(c, "%s(): err from test_snapshot_filter: %s", __func__, bch2_err_str(ret)); + return ret; + } + @@ -79369,11 +80911,8 @@ index 000000000000..d05886181118 +static u64 test_rand(void) +{ + u64 v; -+#if 0 -+ v = prandom_u32(); -+#else ++ + prandom_bytes(&v, sizeof(v)); -+#endif + return v; +} + @@ -79394,7 +80933,7 @@ index 000000000000..d05886181118 + ret = commit_do(&trans, NULL, NULL, 0, + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k.k_i)); + if (ret) { -+ bch_err(c, "error in rand_insert: %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + break; + } + } @@ -79430,7 +80969,7 @@ index 000000000000..d05886181118 + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[6].k_i) ?: + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[7].k_i)); + if (ret) { -+ bch_err(c, "error in rand_insert_multi: %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + break; + } + } @@ -79457,7 +80996,7 @@ index 000000000000..d05886181118 + lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); + ret = bkey_err(k); + if (ret) { -+ bch_err(c, "error in rand_lookup: %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + break; + } + } @@ -79480,7 +81019,7 @@ index 000000000000..d05886181118 + k = bch2_btree_iter_peek(iter); + ret = bkey_err(k); + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ bch_err(trans->c, "lookup error in rand_mixed: %s", bch2_err_str(ret)); ++ bch_err(trans->c, "%s(): lookup error: %s", __func__, bch2_err_str(ret)); + if (ret) + return ret; + @@ -79510,7 +81049,7 @@ index 000000000000..d05886181118 + ret = commit_do(&trans, NULL, NULL, 0, + rand_mixed_trans(&trans, &iter, &cookie, i, rand)); + if (ret) { -+ bch_err(c, "update error in rand_mixed: %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): update error: %s", __func__, bch2_err_str(ret)); + break; + } + } @@ -79556,7 +81095,7 @@ index 
000000000000..d05886181118 + ret = commit_do(&trans, NULL, NULL, 0, + __do_delete(&trans, pos)); + if (ret) { -+ bch_err(c, "error in rand_delete: %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + break; + } + } @@ -79588,7 +81127,7 @@ index 000000000000..d05886181118 + bch2_trans_update(&trans, &iter, &insert.k_i, 0); + })); + if (ret) -+ bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret)); ++ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + + bch2_trans_exit(&trans); + return ret; @@ -79607,7 +81146,7 @@ index 000000000000..d05886181118 + SPOS(0, 0, U32_MAX), 0, k, + 0); + if (ret) -+ bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret)); ++ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + + bch2_trans_exit(&trans); + return ret; @@ -79633,7 +81172,7 @@ index 000000000000..d05886181118 + bch2_trans_update(&trans, &iter, &u.k_i, 0); + })); + if (ret) -+ bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret)); ++ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + + bch2_trans_exit(&trans); + return ret; @@ -79647,7 +81186,7 @@ index 000000000000..d05886181118 + SPOS(0, 0, U32_MAX), SPOS_MAX, + 0, NULL); + if (ret) -+ bch_err(c, "error in seq_delete: %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + return ret; +} + @@ -79819,10 +81358,10 @@ index 000000000000..70573981b87d +#include diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c new file mode 100644 -index 000000000000..81befc433aeb +index 000000000000..62fa662019ad --- /dev/null +++ b/fs/bcachefs/util.c -@@ -0,0 +1,993 @@ +@@ -0,0 +1,1104 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * random utiility code, for bcache but in theory not specific to bcache @@ -79847,6 +81386,7 @@ index 000000000000..81befc433aeb +#include +#include +#include ++#include + +#include "eytzinger.h" +#include "util.h" @@ -80121,6 +81661,26 @@ index 000000000000..81befc433aeb + console_unlock(); +} + 
++int bch2_prt_backtrace(struct printbuf *out, struct task_struct *task) ++{ ++ unsigned long entries[32]; ++ unsigned i, nr_entries; ++ int ret; ++ ++ ret = down_read_killable(&task->signal->exec_update_lock); ++ if (ret) ++ return ret; ++ ++ nr_entries = stack_trace_save_tsk(task, entries, ARRAY_SIZE(entries), 0); ++ for (i = 0; i < nr_entries; i++) { ++ prt_printf(out, "[<0>] %pB", (void *)entries[i]); ++ prt_newline(out); ++ } ++ ++ up_read(&task->signal->exec_update_lock); ++ return 0; ++} ++ +/* time stats: */ + +static void bch2_time_stats_update_one(struct time_stats *stats, @@ -80128,38 +81688,44 @@ index 000000000000..81befc433aeb +{ + u64 duration, freq; + -+ duration = time_after64(end, start) -+ ? end - start : 0; -+ freq = time_after64(end, stats->last_event) -+ ? end - stats->last_event : 0; ++ if (time_after64(end, start)) { ++ duration = end - start; ++ stats->duration_stats = mean_and_variance_update(stats->duration_stats, ++ duration); ++ stats->duration_stats_weighted = mean_and_variance_weighted_update( ++ stats->duration_stats_weighted, ++ duration); ++ stats->max_duration = max(stats->max_duration, duration); ++ stats->min_duration = min(stats->min_duration, duration); ++ bch2_quantiles_update(&stats->quantiles, duration); ++ } + -+ stats->count++; -+ -+ stats->average_duration = stats->average_duration -+ ? ewma_add(stats->average_duration, duration, 6) -+ : duration; -+ -+ stats->average_frequency = stats->average_frequency -+ ? 
ewma_add(stats->average_frequency, freq, 6) -+ : freq; -+ -+ stats->max_duration = max(stats->max_duration, duration); -+ -+ stats->last_event = end; -+ -+ bch2_quantiles_update(&stats->quantiles, duration); ++ if (time_after64(end, stats->last_event)) { ++ freq = end - stats->last_event; ++ stats->freq_stats = mean_and_variance_update(stats->freq_stats, freq); ++ stats->freq_stats_weighted = mean_and_variance_weighted_update( ++ stats->freq_stats_weighted, ++ freq); ++ stats->max_freq = max(stats->max_freq, freq); ++ stats->min_freq = min(stats->min_freq, freq); ++ stats->last_event = end; ++ } +} + +void __bch2_time_stats_update(struct time_stats *stats, u64 start, u64 end) +{ + unsigned long flags; + ++ WARN_RATELIMIT(!stats->min_duration || !stats->min_freq, ++ "time_stats: min_duration = %llu, min_freq = %llu", ++ stats->min_duration, stats->min_freq); ++ + if (!stats->buffer) { + spin_lock_irqsave(&stats->lock, flags); + bch2_time_stats_update_one(stats, start, end); + -+ if (stats->average_frequency < 32 && -+ stats->count > 1024) ++ if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted) < 32 && ++ stats->duration_stats.n > 1024) + stats->buffer = + alloc_percpu_gfp(struct time_stat_buffer, + GFP_ATOMIC); @@ -80194,12 +81760,15 @@ index 000000000000..81befc433aeb + +static const struct time_unit { + const char *name; -+ u32 nsecs; ++ u64 nsecs; +} time_units[] = { -+ { "ns", 1 }, -+ { "us", NSEC_PER_USEC }, -+ { "ms", NSEC_PER_MSEC }, -+ { "sec", NSEC_PER_SEC }, ++ { "ns", 1 }, ++ { "us", NSEC_PER_USEC }, ++ { "ms", NSEC_PER_MSEC }, ++ { "s", NSEC_PER_SEC }, ++ { "m", NSEC_PER_SEC * 60}, ++ { "h", NSEC_PER_SEC * 3600}, ++ { "eon", U64_MAX }, +}; + +static const struct time_unit *pick_time_units(u64 ns) @@ -80219,38 +81788,117 @@ index 000000000000..81befc433aeb +{ + const struct time_unit *u = pick_time_units(ns); + -+ prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name); ++ prt_printf(out, "%llu ", div64_u64(ns, u->nsecs)); ++ 
prt_tab_rjust(out); ++ prt_printf(out, "%s", u->name); ++} ++ ++#define TABSTOP_SIZE 12 ++ ++static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns) ++{ ++ prt_str(out, name); ++ prt_tab(out); ++ pr_time_units(out, ns); ++ prt_newline(out); +} + +void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats) +{ + const struct time_unit *u; -+ u64 freq = READ_ONCE(stats->average_frequency); -+ u64 q, last_q = 0; ++ s64 f_mean = 0, d_mean = 0; ++ u64 q, last_q = 0, f_stddev = 0, d_stddev = 0; + int i; ++ /* ++ * avoid divide by zero ++ */ ++ if (stats->freq_stats.n) { ++ f_mean = mean_and_variance_get_mean(stats->freq_stats); ++ f_stddev = mean_and_variance_get_stddev(stats->freq_stats); ++ d_mean = mean_and_variance_get_mean(stats->duration_stats); ++ d_stddev = mean_and_variance_get_stddev(stats->duration_stats); ++ } + -+ prt_printf(out, "count:\t\t%llu", -+ stats->count); -+ prt_newline(out); -+ prt_printf(out, "rate:\t\t%llu/sec", -+ freq ? div64_u64(NSEC_PER_SEC, freq) : 0); ++ printbuf_tabstop_push(out, out->indent + TABSTOP_SIZE); ++ prt_printf(out, "count:"); ++ prt_tab(out); ++ prt_printf(out, "%llu ", ++ stats->duration_stats.n); ++ printbuf_tabstop_pop(out); + prt_newline(out); + -+ prt_printf(out, "frequency:\t"); -+ pr_time_units(out, freq); ++ printbuf_tabstops_reset(out); + -+ prt_newline(out); -+ prt_printf(out, "avg duration:\t"); -+ pr_time_units(out, stats->average_duration); ++ printbuf_tabstop_push(out, out->indent + 20); ++ printbuf_tabstop_push(out, TABSTOP_SIZE + 2); ++ printbuf_tabstop_push(out, 0); ++ printbuf_tabstop_push(out, TABSTOP_SIZE + 2); + ++ prt_tab(out); ++ prt_printf(out, "since mount"); ++ prt_tab_rjust(out); ++ prt_tab(out); ++ prt_printf(out, "recent"); ++ prt_tab_rjust(out); + prt_newline(out); -+ prt_printf(out, "max duration:\t"); -+ pr_time_units(out, stats->max_duration); ++ ++ printbuf_tabstops_reset(out); ++ printbuf_tabstop_push(out, out->indent + 20); ++ 
printbuf_tabstop_push(out, TABSTOP_SIZE); ++ printbuf_tabstop_push(out, 2); ++ printbuf_tabstop_push(out, TABSTOP_SIZE); ++ ++ prt_printf(out, "duration of events"); ++ prt_newline(out); ++ printbuf_indent_add(out, 2); ++ ++ pr_name_and_units(out, "min:", stats->min_duration); ++ pr_name_and_units(out, "max:", stats->max_duration); ++ ++ prt_printf(out, "mean:"); ++ prt_tab(out); ++ pr_time_units(out, d_mean); ++ prt_tab(out); ++ pr_time_units(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted)); ++ prt_newline(out); ++ ++ prt_printf(out, "stddev:"); ++ prt_tab(out); ++ pr_time_units(out, d_stddev); ++ prt_tab(out); ++ pr_time_units(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted)); ++ ++ printbuf_indent_sub(out, 2); ++ prt_newline(out); ++ ++ prt_printf(out, "time between events"); ++ prt_newline(out); ++ printbuf_indent_add(out, 2); ++ ++ pr_name_and_units(out, "min:", stats->min_freq); ++ pr_name_and_units(out, "max:", stats->max_freq); ++ ++ prt_printf(out, "mean:"); ++ prt_tab(out); ++ pr_time_units(out, f_mean); ++ prt_tab(out); ++ pr_time_units(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted)); ++ prt_newline(out); ++ ++ prt_printf(out, "stddev:"); ++ prt_tab(out); ++ pr_time_units(out, f_stddev); ++ prt_tab(out); ++ pr_time_units(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted)); ++ ++ printbuf_indent_sub(out, 2); ++ prt_newline(out); ++ ++ printbuf_tabstops_reset(out); + + i = eytzinger0_first(NR_QUANTILES); + u = pick_time_units(stats->quantiles.entries[i].m); + -+ prt_newline(out); + prt_printf(out, "quantiles (%s):\t", u->name); + eytzinger0_for_each(i, NR_QUANTILES) { + bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; @@ -80272,6 +81920,10 @@ index 000000000000..81befc433aeb +void bch2_time_stats_init(struct time_stats *stats) +{ + memset(stats, 0, sizeof(*stats)); ++ stats->duration_stats_weighted.w = 8; ++ stats->freq_stats_weighted.w = 8; ++ 
stats->min_duration = U64_MAX; ++ stats->min_freq = U64_MAX; + spin_lock_init(&stats->lock); +} + @@ -80498,8 +82150,6 @@ index 000000000000..81befc433aeb + } +} + -+#include "eytzinger.h" -+ +static int alignment_ok(const void *base, size_t align) +{ + return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || @@ -80818,10 +82468,10 @@ index 000000000000..81befc433aeb +} diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h new file mode 100644 -index 000000000000..aa8b416a919a +index 000000000000..846e6024a80b --- /dev/null +++ b/fs/bcachefs/util.h -@@ -0,0 +1,787 @@ +@@ -0,0 +1,793 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_UTIL_H +#define _BCACHEFS_UTIL_H @@ -80842,6 +82492,7 @@ index 000000000000..aa8b416a919a +#include +#include +#include ++#include + +struct closure; + @@ -81180,6 +82831,7 @@ index 000000000000..aa8b416a919a +void bch2_prt_u64_binary(struct printbuf *, u64, unsigned); + +void bch2_print_string_as_lines(const char *prefix, const char *lines); ++int bch2_prt_backtrace(struct printbuf *, struct task_struct *); + +#define NR_QUANTILES 15 +#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES) @@ -81203,14 +82855,18 @@ index 000000000000..aa8b416a919a + +struct time_stats { + spinlock_t lock; -+ u64 count; + /* all fields are in nanoseconds */ -+ u64 average_duration; -+ u64 average_frequency; + u64 max_duration; ++ u64 min_duration; ++ u64 max_freq; ++ u64 min_freq; + u64 last_event; + struct quantiles quantiles; + ++ struct mean_and_variance duration_stats; ++ struct mean_and_variance_weighted duration_stats_weighted; ++ struct mean_and_variance freq_stats; ++ struct mean_and_variance_weighted freq_stats_weighted; + struct time_stat_buffer __percpu *buffer; +}; + @@ -81824,10 +83480,10 @@ index 000000000000..53a694d71967 +#endif /* _VSTRUCTS_H */ diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c new file mode 100644 -index 000000000000..6a5be6c9e1ca +index 000000000000..4fc1c3afab69 --- /dev/null +++ 
b/fs/bcachefs/xattr.c -@@ -0,0 +1,650 @@ +@@ -0,0 +1,654 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -82201,8 +83857,10 @@ index 000000000000..6a5be6c9e1ca +{ + struct bch_inode_info *inode = to_bch_ei(vinode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ int ret; + -+ return bch2_xattr_get(c, inode, name, buffer, size, handler->flags); ++ ret = bch2_xattr_get(c, inode, name, buffer, size, handler->flags); ++ return bch2_err_class(ret); +} + +static int bch2_xattr_set_handler(const struct xattr_handler *handler, @@ -82214,11 +83872,13 @@ index 000000000000..6a5be6c9e1ca + struct bch_inode_info *inode = to_bch_ei(vinode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); ++ int ret; + -+ return bch2_trans_do(c, NULL, NULL, 0, ++ ret = bch2_trans_do(c, NULL, NULL, 0, + bch2_xattr_set(&trans, inode_inum(inode), &hash, + name, value, size, + handler->flags, flags)); ++ return bch2_err_class(ret); +} + +static const struct xattr_handler bch_xattr_user_handler = { @@ -82480,7 +84140,7 @@ index 000000000000..6a5be6c9e1ca +} diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h new file mode 100644 -index 000000000000..66d7a1e30350 +index 000000000000..03f1b73fc926 --- /dev/null +++ b/fs/bcachefs/xattr.h @@ -0,0 +1,50 @@ @@ -82495,10 +84155,10 @@ index 000000000000..66d7a1e30350 +int bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + -+#define bch2_bkey_ops_xattr (struct bkey_ops) { \ ++#define bch2_bkey_ops_xattr ((struct bkey_ops) { \ + .key_invalid = bch2_xattr_invalid, \ + .val_to_text = bch2_xattr_to_text, \ -+} ++}) + +static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len) +{ @@ -82617,7 +84277,7 @@ index bb0c4d0038db..d77832eb0785 100644 } EXPORT_SYMBOL(d_tmpfile); diff --git a/fs/inode.c b/fs/inode.c -index ba1de23c13c1..cb7969ab3633 
100644 +index b608528efd3a..56756dc56346 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -56,8 +56,23 @@ @@ -82646,7 +84306,7 @@ index ba1de23c13c1..cb7969ab3633 100644 /* * Empty aops. Can be used for the cases where the user does not -@@ -417,7 +432,7 @@ EXPORT_SYMBOL(address_space_init_once); +@@ -416,7 +431,7 @@ EXPORT_SYMBOL(address_space_init_once); void inode_init_once(struct inode *inode) { memset(inode, 0, sizeof(*inode)); @@ -82655,7 +84315,7 @@ index ba1de23c13c1..cb7969ab3633 100644 INIT_LIST_HEAD(&inode->i_devices); INIT_LIST_HEAD(&inode->i_io_list); INIT_LIST_HEAD(&inode->i_wb_list); -@@ -506,14 +521,15 @@ static inline void inode_sb_list_del(struct inode *inode) +@@ -505,14 +520,15 @@ static inline void inode_sb_list_del(struct inode *inode) } } @@ -82678,7 +84338,7 @@ index ba1de23c13c1..cb7969ab3633 100644 } /** -@@ -526,13 +542,13 @@ static unsigned long hash(struct super_block *sb, unsigned long hashval) +@@ -525,13 +541,13 @@ static unsigned long hash(struct super_block *sb, unsigned long hashval) */ void __insert_inode_hash(struct inode *inode, unsigned long hashval) { @@ -82696,7 +84356,7 @@ index ba1de23c13c1..cb7969ab3633 100644 } EXPORT_SYMBOL(__insert_inode_hash); -@@ -544,11 +560,44 @@ EXPORT_SYMBOL(__insert_inode_hash); +@@ -543,11 +559,44 @@ EXPORT_SYMBOL(__insert_inode_hash); */ void __remove_inode_hash(struct inode *inode) { @@ -82746,7 +84406,7 @@ index ba1de23c13c1..cb7969ab3633 100644 } EXPORT_SYMBOL(__remove_inode_hash); -@@ -898,26 +947,28 @@ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc) +@@ -897,26 +946,28 @@ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc) return freed; } @@ -82779,7 +84439,7 @@ index ba1de23c13c1..cb7969ab3633 100644 goto repeat; } if (unlikely(inode->i_state & I_CREATING)) { -@@ -936,19 +987,20 @@ static struct inode *find_inode(struct super_block *sb, +@@ -935,19 +986,20 @@ static struct inode *find_inode(struct super_block *sb, * iget_locked for details. 
*/ static struct inode *find_inode_fast(struct super_block *sb, @@ -82803,7 +84463,7 @@ index ba1de23c13c1..cb7969ab3633 100644 goto repeat; } if (unlikely(inode->i_state & I_CREATING)) { -@@ -1156,25 +1208,25 @@ EXPORT_SYMBOL(unlock_two_nondirectories); +@@ -1155,25 +1207,25 @@ EXPORT_SYMBOL(unlock_two_nondirectories); * return it locked, hashed, and with the I_NEW flag set. The file system gets * to fill it in before unlocking it via unlock_new_inode(). * @@ -82835,7 +84495,7 @@ index ba1de23c13c1..cb7969ab3633 100644 if (IS_ERR(old)) return NULL; wait_on_inode(old); -@@ -1196,7 +1248,7 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, +@@ -1195,7 +1247,7 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, */ spin_lock(&inode->i_lock); inode->i_state |= I_NEW; @@ -82844,7 +84504,7 @@ index ba1de23c13c1..cb7969ab3633 100644 spin_unlock(&inode->i_lock); /* -@@ -1206,7 +1258,7 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, +@@ -1205,7 +1257,7 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, if (list_empty(&inode->i_sb_list)) inode_sb_list_add(inode); unlock: @@ -82853,7 +84513,7 @@ index ba1de23c13c1..cb7969ab3633 100644 return inode; } -@@ -1267,12 +1319,12 @@ EXPORT_SYMBOL(iget5_locked); +@@ -1266,12 +1318,12 @@ EXPORT_SYMBOL(iget5_locked); */ struct inode *iget_locked(struct super_block *sb, unsigned long ino) { @@ -82870,7 +84530,7 @@ index ba1de23c13c1..cb7969ab3633 100644 if (inode) { if (IS_ERR(inode)) return NULL; -@@ -1288,17 +1340,17 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) +@@ -1287,17 +1339,17 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) if (inode) { struct inode *old; @@ -82892,7 +84552,7 @@ index ba1de23c13c1..cb7969ab3633 100644 /* Return the locked inode with I_NEW set, the * caller is responsible for filling in the contents -@@ -1311,7 +1363,7 @@ struct inode *iget_locked(struct super_block 
*sb, unsigned long ino) +@@ -1310,7 +1362,7 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) * us. Use the old inode instead of the one we just * allocated. */ @@ -82901,7 +84561,7 @@ index ba1de23c13c1..cb7969ab3633 100644 destroy_inode(inode); if (IS_ERR(old)) return NULL; -@@ -1335,10 +1387,11 @@ EXPORT_SYMBOL(iget_locked); +@@ -1334,10 +1386,11 @@ EXPORT_SYMBOL(iget_locked); */ static int test_inode_iunique(struct super_block *sb, unsigned long ino) { @@ -82915,7 +84575,7 @@ index ba1de23c13c1..cb7969ab3633 100644 if (inode->i_ino == ino && inode->i_sb == sb) return 0; } -@@ -1422,12 +1475,12 @@ EXPORT_SYMBOL(igrab); +@@ -1421,12 +1474,12 @@ EXPORT_SYMBOL(igrab); struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { @@ -82932,7 +84592,7 @@ index ba1de23c13c1..cb7969ab3633 100644 return IS_ERR(inode) ? NULL : inode; } -@@ -1477,12 +1530,12 @@ EXPORT_SYMBOL(ilookup5); +@@ -1476,12 +1529,12 @@ EXPORT_SYMBOL(ilookup5); */ struct inode *ilookup(struct super_block *sb, unsigned long ino) { @@ -82949,7 +84609,7 @@ index ba1de23c13c1..cb7969ab3633 100644 if (inode) { if (IS_ERR(inode)) -@@ -1526,12 +1579,13 @@ struct inode *find_inode_nowait(struct super_block *sb, +@@ -1525,12 +1578,13 @@ struct inode *find_inode_nowait(struct super_block *sb, void *), void *data) { @@ -82966,7 +84626,7 @@ index ba1de23c13c1..cb7969ab3633 100644 if (inode->i_sb != sb) continue; mval = match(inode, hashval, data); -@@ -1542,7 +1596,7 @@ struct inode *find_inode_nowait(struct super_block *sb, +@@ -1541,7 +1595,7 @@ struct inode *find_inode_nowait(struct super_block *sb, goto out; } out: @@ -82975,7 +84635,7 @@ index ba1de23c13c1..cb7969ab3633 100644 return ret_inode; } EXPORT_SYMBOL(find_inode_nowait); -@@ -1571,13 +1625,14 @@ EXPORT_SYMBOL(find_inode_nowait); +@@ -1570,13 +1624,14 @@ EXPORT_SYMBOL(find_inode_nowait); struct inode *find_inode_rcu(struct super_block *sb, unsigned long 
hashval, int (*test)(struct inode *, void *), void *data) { @@ -82992,7 +84652,7 @@ index ba1de23c13c1..cb7969ab3633 100644 if (inode->i_sb == sb && !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)) && test(inode, data)) -@@ -1609,13 +1664,14 @@ EXPORT_SYMBOL(find_inode_rcu); +@@ -1608,13 +1663,14 @@ EXPORT_SYMBOL(find_inode_rcu); struct inode *find_inode_by_ino_rcu(struct super_block *sb, unsigned long ino) { @@ -83009,7 +84669,7 @@ index ba1de23c13c1..cb7969ab3633 100644 if (inode->i_ino == ino && inode->i_sb == sb && !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE))) -@@ -1629,39 +1685,42 @@ int insert_inode_locked(struct inode *inode) +@@ -1628,39 +1684,42 @@ int insert_inode_locked(struct inode *inode) { struct super_block *sb = inode->i_sb; ino_t ino = inode->i_ino; @@ -83065,7 +84725,7 @@ index ba1de23c13c1..cb7969ab3633 100644 wait_on_inode(old); if (unlikely(!inode_unhashed(old))) { iput(old); -@@ -2217,17 +2276,18 @@ EXPORT_SYMBOL(inode_needs_sync); +@@ -2216,17 +2275,18 @@ EXPORT_SYMBOL(inode_needs_sync); * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list * will DTRT. 
*/ @@ -83087,7 +84747,7 @@ index ba1de23c13c1..cb7969ab3633 100644 } static __initdata unsigned long ihash_entries; -@@ -2253,7 +2313,7 @@ void __init inode_init_early(void) +@@ -2252,7 +2312,7 @@ void __init inode_init_early(void) inode_hashtable = alloc_large_system_hash("Inode-cache", @@ -83096,7 +84756,7 @@ index ba1de23c13c1..cb7969ab3633 100644 ihash_entries, 14, HASH_EARLY | HASH_ZERO, -@@ -2279,7 +2339,7 @@ void __init inode_init(void) +@@ -2278,7 +2338,7 @@ void __init inode_init(void) inode_hashtable = alloc_large_system_hash("Inode-cache", @@ -83106,7 +84766,7 @@ index ba1de23c13c1..cb7969ab3633 100644 14, HASH_ZERO, diff --git a/include/linux/bio.h b/include/linux/bio.h -index ca22b06700a9..5692e54eb446 100644 +index 2c5806997bbf..85801ddacfb9 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -480,7 +480,12 @@ extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, @@ -83298,7 +84958,7 @@ index fe848901fcc3..5a3cc0e1da9b 100644 * 128 bit child FID (struct lu_fid) * 128 bit parent FID (struct lu_fid) diff --git a/include/linux/fs.h b/include/linux/fs.h -index 9eced4cc286e..612ac13ace17 100644 +index 56a4b4b02477..70d7b30a35c0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -645,7 +645,8 @@ struct inode { @@ -83329,7 +84989,7 @@ index 9eced4cc286e..612ac13ace17 100644 } /* -@@ -3112,7 +3113,7 @@ static inline void insert_inode_hash(struct inode *inode) +@@ -3113,7 +3114,7 @@ static inline void insert_inode_hash(struct inode *inode) extern void __remove_inode_hash(struct inode *); static inline void remove_inode_hash(struct inode *inode) { @@ -83563,6 +85223,182 @@ index d22430840b53..506e769b4a95 100644 #ifdef CONFIG_LOCK_STAT unsigned long contention_point[LOCKSTAT_POINTS]; +diff --git a/include/linux/mean_and_variance.h b/include/linux/mean_and_variance.h +new file mode 100644 +index 000000000000..3d62abe75976 +--- /dev/null ++++ b/include/linux/mean_and_variance.h +@@ -0,0 +1,170 @@ ++/* 
SPDX-License-Identifier: GPL-2.0 */ ++#ifndef MEAN_AND_VARIANCE_H_ ++#define MEAN_AND_VARIANCE_H_ ++ ++#include ++#include ++#include ++#include ++ ++#define SQRT_U64_MAX 4294967295ULL ++ ++ ++#if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__) ++ ++typedef unsigned __int128 u128; ++ ++static inline u128 u64_to_u128(u64 a) ++{ ++ return (u128)a; ++} ++ ++static inline u64 u128_to_u64(u128 a) ++{ ++ return (u64)a; ++} ++ ++static inline u64 u128_shr64_to_u64(u128 a) ++{ ++ return (u64)(a >> 64); ++} ++ ++static inline u128 u128_add(u128 a, u128 b) ++{ ++ return a + b; ++} ++ ++static inline u128 u128_sub(u128 a, u128 b) ++{ ++ return a - b; ++} ++ ++static inline u128 u128_shl(u128 i, s8 shift) ++{ ++ return i << shift; ++} ++ ++static inline u128 u128_shl64_add(u64 a, u64 b) ++{ ++ return ((u128)a << 64) + b; ++} ++ ++static inline u128 u128_square(u64 i) ++{ ++ return i*i; ++} ++ ++#else ++ ++typedef struct { ++ u64 hi, lo; ++} u128; ++ ++static inline u128 u64_to_u128(u64 a) ++{ ++ return (u128){ .lo = a }; ++} ++ ++static inline u64 u128_to_u64(u128 a) ++{ ++ return a.lo; ++} ++ ++static inline u64 u128_shr64_to_u64(u128 a) ++{ ++ return a.hi; ++} ++ ++static inline u128 u128_add(u128 a, u128 b) ++{ ++ u128 c; ++ ++ c.lo = a.lo + b.lo; ++ c.hi = a.hi + b.hi + (c.lo < a.lo); ++ return c; ++} ++ ++static inline u128 u128_sub(u128 a, u128 b) ++{ ++ u128 c; ++ ++ c.lo = a.lo - b.lo; ++ c.hi = a.hi - b.hi - (c.lo > a.lo); ++ return c; ++} ++ ++static inline u128 u128_shl(u128 i, s8 shift) ++{ ++ u128 r; ++ ++ r.lo = i.lo << shift; ++ if (shift < 64) ++ r.hi = (i.hi << shift) | (i.lo >> (64 - shift)); ++ else { ++ r.hi = i.lo << (shift - 64); ++ r.lo = 0; ++ } ++ return r; ++} ++ ++static inline u128 u128_shl64_add(u64 a, u64 b) ++{ ++ return u128_add(u128_shl(u64_to_u128(a), 64), u64_to_u128(b)); ++} ++ ++static inline u128 u128_square(u64 i) ++{ ++ u128 r; ++ u64 h = i >> 32, l = i & (u64)U32_MAX; ++ ++ r = u128_shl(u64_to_u128(h*h), 64); ++ r = 
u128_add(r, u128_shl(u64_to_u128(h*l), 32)); ++ r = u128_add(r, u128_shl(u64_to_u128(l*h), 32)); ++ r = u128_add(r, u64_to_u128(l*l)); ++ return r; ++} ++ ++#endif ++ ++static inline u128 u128_div(u128 n, u64 d) ++{ ++ u128 r; ++ u64 rem; ++ u64 hi = u128_shr64_to_u64(n); ++ u64 lo = u128_to_u64(n); ++ u64 h = hi & ((u64)U32_MAX << 32); ++ u64 l = (hi & (u64)U32_MAX) << 32; ++ ++ r = u128_shl(u64_to_u128(div64_u64_rem(h, d, &rem)), 64); ++ r = u128_add(r, u128_shl(u64_to_u128(div64_u64_rem(l + (rem << 32), d, &rem)), 32)); ++ r = u128_add(r, u64_to_u128(div64_u64_rem(lo + (rem << 32), d, &rem))); ++ return r; ++} ++ ++struct mean_and_variance { ++ s64 n; ++ s64 sum; ++ u128 sum_squares; ++}; ++ ++/* expontentially weighted variant */ ++struct mean_and_variance_weighted { ++ bool init; ++ u8 w; ++ s64 mean; ++ u64 variance; ++}; ++ ++inline s64 fast_divpow2(s64 n, u8 d); ++ ++struct mean_and_variance mean_and_variance_update(struct mean_and_variance s1, s64 v1); ++ s64 mean_and_variance_get_mean(struct mean_and_variance s); ++ u64 mean_and_variance_get_variance(struct mean_and_variance s1); ++ u32 mean_and_variance_get_stddev(struct mean_and_variance s); ++ ++struct mean_and_variance_weighted mean_and_variance_weighted_update(struct mean_and_variance_weighted s1, s64 v1); ++ s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s); ++ u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s); ++ u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s); ++ ++#endif // MEAN_AND_VAIRANCE_H_ diff --git a/include/linux/pretty-printers.h b/include/linux/pretty-printers.h new file mode 100644 index 000000000000..f39d8edfba02 @@ -83892,7 +85728,7 @@ index 000000000000..24e62e56d18c + +#endif /* _LINUX_PRINTBUF_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h -index e7b2f8a5c711..4def6fae37b8 100644 +index 8d82d6d32670..24ffd47480ac 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h 
@@ -859,6 +859,7 @@ struct task_struct { @@ -84406,10 +86242,10 @@ index 4d72258d42fd..52e0f1d283b9 100644 unsigned int flags, const char *only) { diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h -index 8401dec93c15..fa734bc7af1d 100644 +index 20749bd9db71..93409ba238f8 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h -@@ -496,7 +496,7 @@ struct dynevent_cmd; +@@ -497,7 +497,7 @@ struct dynevent_cmd; typedef int (*dynevent_create_fn_t)(struct dynevent_cmd *cmd); struct dynevent_cmd { @@ -84504,10 +86340,10 @@ index 096d48aa3437..8d11e2e4ddc8 100644 extern void *__vmalloc(unsigned long size, gfp_t gfp_mask) __alloc_size(1); diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h new file mode 100644 -index 000000000000..d3d9e965e702 +index 000000000000..6bc361ae7d7a --- /dev/null +++ b/include/trace/events/bcachefs.h -@@ -0,0 +1,1101 @@ +@@ -0,0 +1,1105 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM bcachefs @@ -84854,25 +86690,29 @@ index 000000000000..d3d9e965e702 +TRACE_EVENT(btree_reserve_get_fail, + TP_PROTO(const char *trans_fn, + unsigned long caller_ip, -+ size_t required), -+ TP_ARGS(trans_fn, caller_ip, required), ++ size_t required, ++ int ret), ++ TP_ARGS(trans_fn, caller_ip, required, ret), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + __field(size_t, required ) ++ __array(char, ret, 32 ) + ), + + TP_fast_assign( -+ strlcpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); ++ strscpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->required = required; ++ strscpy(__entry->ret, bch2_err_str(ret), sizeof(__entry->ret)); + ), + -+ TP_printk("%s %pS required %zu", ++ TP_printk("%s %pS required %zu ret %s", + __entry->trans_fn, + (void *) __entry->caller_ip, -+ __entry->required) ++ __entry->required, ++ __entry->ret) +); + 
+DEFINE_EVENT(btree_node, btree_node_compact, @@ -84921,7 +86761,7 @@ index 000000000000..d3d9e965e702 + TP_fast_assign( + struct btree *b = btree_path_node(path, level); + -+ strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); ++ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->btree_id = path->btree_id; + __entry->level = path->level; @@ -84972,7 +86812,7 @@ index 000000000000..d3d9e965e702 + TP_fast_assign( + struct six_lock_count c; + -+ strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); ++ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->btree_id = path->btree_id; + __entry->level = level; @@ -85034,7 +86874,7 @@ index 000000000000..d3d9e965e702 + + TP_fast_assign( + __entry->dev = ca->dev; -+ strlcpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); ++ strscpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); + __entry->user = user; + __entry->bucket = bucket; + ), @@ -85078,7 +86918,7 @@ index 000000000000..d3d9e965e702 + + TP_fast_assign( + __entry->dev = ca->dev; -+ strlcpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); ++ strscpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); + __entry->free = free; + __entry->avail = avail; + __entry->copygc_wait_amount = copygc_wait_amount; @@ -85088,7 +86928,7 @@ index 000000000000..d3d9e965e702 + __entry->need_journal_commit = need_journal_commit; + __entry->nouse = nouse; + __entry->nonblocking = nonblocking; -+ strlcpy(__entry->err, err, sizeof(__entry->err)); ++ strscpy(__entry->err, err, sizeof(__entry->err)); + ), + + TP_printk("%d,%d reserve %s free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nonblocking %u err %s", @@ -85126,7 +86966,7 @@ index 000000000000..d3d9e965e702 + __entry->open = open; + __entry->need_journal_commit = need_journal_commit; + 
__entry->discarded = discarded; -+ strlcpy(__entry->err, err, sizeof(__entry->err)); ++ strscpy(__entry->err, err, sizeof(__entry->err)); + ), + + TP_printk("%d%d seen %llu open %llu need_journal_commit %llu discarded %llu err %s", @@ -85276,7 +87116,7 @@ index 000000000000..d3d9e965e702 + ), + + TP_fast_assign( -+ strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); ++ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + ), + @@ -85321,7 +87161,7 @@ index 000000000000..d3d9e965e702 + ), + + TP_fast_assign( -+ strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); ++ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->flags = flags; + ), @@ -85381,7 +87221,7 @@ index 000000000000..d3d9e965e702 + ), + + TP_fast_assign( -+ strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); ++ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->btree_id = path->btree_id; + TRACE_BPOS_assign(pos, path->pos) @@ -85428,7 +87268,7 @@ index 000000000000..d3d9e965e702 + ), + + TP_fast_assign( -+ strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); ++ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->btree_id = path->btree_id; + __entry->old_locks_want = old_locks_want; @@ -85537,7 +87377,7 @@ index 000000000000..d3d9e965e702 + ), + + TP_fast_assign( -+ strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); ++ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + ), + + TP_printk("%s", __entry->trans_fn) @@ -85556,7 +87396,7 @@ index 000000000000..d3d9e965e702 + ), + + TP_fast_assign( -+ strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); ++ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->bytes = bytes; + ), @@ 
-85585,7 +87425,7 @@ index 000000000000..d3d9e965e702 + ), + + TP_fast_assign( -+ strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); ++ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + + __entry->btree_id = path->btree_id; @@ -85717,10 +87557,10 @@ index 64a13eb56078..0e83dfd9c20b 100644 +#endif diff --git a/kernel/locking/six.c b/kernel/locking/six.c new file mode 100644 -index 000000000000..b11660af245b +index 000000000000..39a9bd6ecd78 --- /dev/null +++ b/kernel/locking/six.c -@@ -0,0 +1,748 @@ +@@ -0,0 +1,757 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include @@ -85871,6 +87711,14 @@ index 000000000000..b11660af245b + atomic64_add(__SIX_VAL(write_locking, 1), + &lock->state.counter); + smp_mb__after_atomic(); ++ } else if (!(lock->state.waiters & (1 << SIX_LOCK_write))) { ++ atomic64_add(__SIX_VAL(waiters, 1 << SIX_LOCK_write), ++ &lock->state.counter); ++ /* ++ * pairs with barrier after unlock and before checking ++ * for readers in unlock path ++ */ ++ smp_mb__after_atomic(); + } + + ret = !pcpu_read_count(lock); @@ -85885,9 +87733,6 @@ index 000000000000..b11660af245b + if (ret || try) + v -= __SIX_VAL(write_locking, 1); + -+ if (!ret && !try && !(lock->state.waiters & (1 << SIX_LOCK_write))) -+ v += __SIX_VAL(waiters, 1 << SIX_LOCK_write); -+ + if (try && !ret) { + old.v = atomic64_add_return(v, &lock->state.counter); + if (old.waiters & (1 << SIX_LOCK_read)) @@ -86060,7 +87905,11 @@ index 000000000000..b11660af245b + return true; +} + -+#ifdef CONFIG_LOCK_SPIN_ON_OWNER ++/* ++ * We don't see stable performance with SIX_LOCK_SPIN_ON_OWNER enabled, so it's ++ * off for now: ++ */ ++#ifdef SIX_LOCK_SPIN_ON_OWNER + +static inline bool six_optimistic_spin(struct six_lock *lock, + struct six_lock_waiter *wait) @@ -86505,10 +88354,10 @@ index 9ed5ce989415..4f65824879ab 100644 /** * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array diff --git a/kernel/trace/trace.c 
b/kernel/trace/trace.c -index d3005279165d..b27fe1e45138 100644 +index cc65887b31bd..e62ecff562a4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c -@@ -1673,15 +1673,15 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) +@@ -1679,15 +1679,15 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) { int len; @@ -86528,7 +88377,7 @@ index d3005279165d..b27fe1e45138 100644 return cnt; } -@@ -3728,11 +3728,7 @@ static bool trace_safe_str(struct trace_iterator *iter, const char *str, +@@ -3743,11 +3743,7 @@ static bool trace_safe_str(struct trace_iterator *iter, const char *str, static const char *show_buffer(struct trace_seq *s) { @@ -86541,7 +88390,7 @@ index d3005279165d..b27fe1e45138 100644 } static DEFINE_STATIC_KEY_FALSE(trace_no_verify); -@@ -6759,12 +6755,12 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, +@@ -6782,12 +6778,12 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, trace_access_lock(iter->cpu_file); while (trace_find_next_entry_inc(iter) != NULL) { enum print_line_t ret; @@ -86556,7 +88405,7 @@ index d3005279165d..b27fe1e45138 100644 break; } if (ret != TRACE_TYPE_NO_CONSUME) -@@ -6786,7 +6782,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, +@@ -6809,7 +6805,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, /* Now copy what we have to the user */ sret = trace_seq_to_user(&iter->seq, ubuf, cnt); @@ -86565,7 +88414,7 @@ index d3005279165d..b27fe1e45138 100644 trace_seq_init(&iter->seq); /* -@@ -6812,16 +6808,15 @@ static size_t +@@ -6835,16 +6831,15 @@ static size_t tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) { size_t count; @@ -86584,7 +88433,7 @@ index d3005279165d..b27fe1e45138 100644 break; } -@@ -6831,14 +6826,14 @@ tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) +@@ -6854,14 +6849,14 @@ tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) * anyway to be safe. 
*/ if (ret == TRACE_TYPE_PARTIAL_LINE) { @@ -86602,7 +88451,7 @@ index d3005279165d..b27fe1e45138 100644 break; } -@@ -9826,20 +9821,8 @@ static struct notifier_block trace_die_notifier = { +@@ -9894,20 +9889,8 @@ static struct notifier_block trace_die_notifier = { void trace_printk_seq(struct trace_seq *s) { @@ -86728,7 +88577,7 @@ index 4b1057ab9d96..9d5137df1a15 100644 kfree(filter->filter_string); filter->filter_string = buf; diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c -index 5e8c07aef071..914b4e5e32a5 100644 +index e310052dc83c..214b33bd7be0 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -5,13 +5,14 @@ @@ -86749,7 +88598,7 @@ index 5e8c07aef071..914b4e5e32a5 100644 #include /* for gfp flag names */ -@@ -611,7 +612,7 @@ static struct synth_field *parse_synth_field(int argc, char **argv, +@@ -622,7 +623,7 @@ static struct synth_field *parse_synth_field(int argc, char **argv, const char *prefix = NULL, *field_type = argv[0], *field_name, *array; struct synth_field *field; int len, ret = -ENOMEM; @@ -86758,7 +88607,7 @@ index 5e8c07aef071..914b4e5e32a5 100644 ssize_t size; if (!strcmp(field_type, "unsigned")) { -@@ -654,28 +655,16 @@ static struct synth_field *parse_synth_field(int argc, char **argv, +@@ -665,28 +666,16 @@ static struct synth_field *parse_synth_field(int argc, char **argv, goto free; } @@ -86793,7 +88642,7 @@ index 5e8c07aef071..914b4e5e32a5 100644 size = synth_field_size(field->type); if (size < 0) { -@@ -687,23 +676,15 @@ static struct synth_field *parse_synth_field(int argc, char **argv, +@@ -698,23 +687,15 @@ static struct synth_field *parse_synth_field(int argc, char **argv, goto free; } else if (size == 0) { if (synth_field_is_string(field->type)) { @@ -86822,7 +88671,7 @@ index 5e8c07aef071..914b4e5e32a5 100644 field->is_dynamic = true; size = sizeof(u64); -@@ -1514,7 +1495,7 @@ static int synth_event_run_command(struct dynevent_cmd *cmd) +@@ -1525,7 +1506,7 
@@ static int synth_event_run_command(struct dynevent_cmd *cmd) struct synth_event *se; int ret; @@ -86849,10 +88698,10 @@ index 203204cadf92..9f270fdde99b 100644 trace_seq_puts(s, " */\n"); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c -index 23f7f0ec4f4c..2917d74bc41e 100644 +index 5a75b039e586..52174bffce95 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c -@@ -919,7 +919,7 @@ static int create_or_delete_trace_kprobe(const char *raw_command) +@@ -920,7 +920,7 @@ static int create_or_delete_trace_kprobe(const char *raw_command) static int trace_kprobe_run_command(struct dynevent_cmd *cmd) { @@ -87155,10 +89004,10 @@ index dc1ab2ed1dc6..e2938759019f 100644 bool depends on !NO_IOMEM diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug -index d3e5f36bb01e..0e9a76ada0f7 100644 +index cb131fad117c..91a71fe5706e 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug -@@ -1656,6 +1656,15 @@ config DEBUG_CREDENTIALS +@@ -1662,6 +1662,15 @@ config DEBUG_CREDENTIALS source "kernel/rcu/Kconfig.debug" @@ -87174,6 +89023,22 @@ index d3e5f36bb01e..0e9a76ada0f7 100644 config DEBUG_WQ_FORCE_RR_CPU bool "Force round-robin CPU selection for unbound work items" depends on DEBUG_KERNEL +@@ -2049,6 +2058,15 @@ config CPUMASK_KUNIT_TEST + + If unsure, say N. + ++config MEAN_AND_VARIANCE_UNIT_TEST ++ tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS ++ depends on KUNIT ++ select MEAN_AND_VARIANCE ++ default KUNIT_ALL_TESTS ++ help ++ This option enables the kunit tests for mean_and_variance module. ++ If unsure, say N. 
++ + config TEST_LIST_SORT + tristate "Linked list sorting test" if !KUNIT_ALL_TESTS + depends on KUNIT diff --git a/lib/Makefile b/lib/Makefile index ffabc30a27d4..9d9d51a116d3 100644 --- a/lib/Makefile @@ -87741,6 +89606,372 @@ index 06833d404398..9556f15ad295 100644 const u8 *ptr = buf; int i, linelen, remaining = len; unsigned char linebuf[32 * 3 + 2 + 32 + 1]; +diff --git a/lib/math/Kconfig b/lib/math/Kconfig +index 0634b428d0cb..7530ae9a3584 100644 +--- a/lib/math/Kconfig ++++ b/lib/math/Kconfig +@@ -15,3 +15,6 @@ config PRIME_NUMBERS + + config RATIONAL + tristate ++ ++config MEAN_AND_VARIANCE ++ tristate +diff --git a/lib/math/Makefile b/lib/math/Makefile +index bfac26ddfc22..2ef1487e01c2 100644 +--- a/lib/math/Makefile ++++ b/lib/math/Makefile +@@ -4,6 +4,8 @@ obj-y += div64.o gcd.o lcm.o int_pow.o int_sqrt.o reciprocal_div.o + obj-$(CONFIG_CORDIC) += cordic.o + obj-$(CONFIG_PRIME_NUMBERS) += prime_numbers.o + obj-$(CONFIG_RATIONAL) += rational.o ++obj-$(CONFIG_MEAN_AND_VARIANCE) += mean_and_variance.o + + obj-$(CONFIG_TEST_DIV64) += test_div64.o + obj-$(CONFIG_RATIONAL_KUNIT_TEST) += rational-test.o ++obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o +diff --git a/lib/math/mean_and_variance.c b/lib/math/mean_and_variance.c +new file mode 100644 +index 000000000000..643e3113500b +--- /dev/null ++++ b/lib/math/mean_and_variance.c +@@ -0,0 +1,178 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Functions for incremental mean and variance. ++ * ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License version 2 as published by ++ * the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++ * more details. ++ * ++ * Copyright © 2022 Daniel B. 
Hill ++ * ++ * Author: Daniel B. Hill ++ * ++ * Description: ++ * ++ * This is includes some incremental algorithms for mean and variance calculation ++ * ++ * Derived from the paper: https://fanf2.user.srcf.net/hermes/doc/antiforgery/stats.pdf ++ * ++ * Create a struct and if it's the weighted variant set the w field (weight = 2^k). ++ * ++ * Use mean_and_variance[_weighted]_update() on the struct to update it's state. ++ * ++ * Use the mean_and_variance[_weighted]_get_* functions to calculate the mean and variance, some computation ++ * is deferred to these functions for performance reasons. ++ * ++ * see lib/math/mean_and_variance_test.c for examples of usage. ++ * ++ * DO NOT access the mean and variance fields of the weighted variants directly. ++ * DO NOT change the weight after calling update. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ ++/** ++ * fast_divpow2() - fast approximation for n / (1 << d) ++ * @n: numerator ++ * @d: the power of 2 denominator. ++ * ++ * note: this rounds towards 0. ++ */ ++inline s64 fast_divpow2(s64 n, u8 d) ++{ ++ return (n + ((n < 0) ? ((1 << d) - 1) : 0)) >> d; ++} ++ ++/** ++ * mean_and_variance_update() - update a mean_and_variance struct @s1 with a new sample @v1 ++ * and return it. ++ * @s1: the mean_and_variance to update. ++ * @v1: the new sample. ++ * ++ * see linked pdf equation 12. 
++ */ ++struct mean_and_variance mean_and_variance_update(struct mean_and_variance s1, s64 v1) ++{ ++ struct mean_and_variance s2; ++ u64 v2 = abs(v1); ++ ++ s2.n = s1.n + 1; ++ s2.sum = s1.sum + v1; ++ s2.sum_squares = u128_add(s1.sum_squares, u128_square(v2)); ++ return s2; ++} ++EXPORT_SYMBOL_GPL(mean_and_variance_update); ++ ++/** ++ * mean_and_variance_get_mean() - get mean from @s ++ */ ++s64 mean_and_variance_get_mean(struct mean_and_variance s) ++{ ++ return div64_u64(s.sum, s.n); ++} ++EXPORT_SYMBOL_GPL(mean_and_variance_get_mean); ++ ++/** ++ * mean_and_variance_get_variance() - get variance from @s1 ++ * ++ * see linked pdf equation 12. ++ */ ++u64 mean_and_variance_get_variance(struct mean_and_variance s1) ++{ ++ u128 s2 = u128_div(s1.sum_squares, s1.n); ++ u64 s3 = abs(mean_and_variance_get_mean(s1)); ++ ++ return u128_to_u64(u128_sub(s2, u128_square(s3))); ++} ++EXPORT_SYMBOL_GPL(mean_and_variance_get_variance); ++ ++/** ++ * mean_and_variance_get_stddev() - get standard deviation from @s ++ */ ++u32 mean_and_variance_get_stddev(struct mean_and_variance s) ++{ ++ return int_sqrt64(mean_and_variance_get_variance(s)); ++} ++EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev); ++ ++/** ++ * mean_and_variance_weighted_update() - exponentially weighted variant of mean_and_variance_update() ++ * @s1: .. ++ * @s2: .. ++ * ++ * see linked pdf: function derived from equations 140-143 where alpha = 2^w. ++ * values are stored bitshifted for performance and added precision. ++ */ ++struct mean_and_variance_weighted mean_and_variance_weighted_update(struct mean_and_variance_weighted s1, ++ s64 x) ++{ ++ struct mean_and_variance_weighted s2; ++ // previous weighted variance. ++ u64 var_w0 = s1.variance; ++ u8 w = s2.w = s1.w; ++ // new value weighted. ++ s64 x_w = x << w; ++ s64 diff_w = x_w - s1.mean; ++ s64 diff = fast_divpow2(diff_w, w); ++ // new mean weighted. 
++ s64 u_w1 = s1.mean + diff; ++ ++ BUG_ON(w % 2 != 0); ++ ++ if (!s1.init) { ++ s2.mean = x_w; ++ s2.variance = 0; ++ } else { ++ s2.mean = u_w1; ++ s2.variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w; ++ } ++ s2.init = true; ++ ++ return s2; ++} ++EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update); ++ ++/** ++ * mean_and_variance_weighted_get_mean() - get mean from @s ++ */ ++s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s) ++{ ++ return fast_divpow2(s.mean, s.w); ++} ++EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean); ++ ++/** ++ * mean_and_variance_weighted_get_variance() -- get variance from @s ++ */ ++u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s) ++{ ++ // always positive don't need fast divpow2 ++ return s.variance >> s.w; ++} ++EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance); ++ ++/** ++ * mean_and_variance_weighted_get_stddev() - get standard deviation from @s ++ */ ++u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s) ++{ ++ return int_sqrt64(mean_and_variance_weighted_get_variance(s)); ++} ++EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_stddev); ++ ++MODULE_AUTHOR("Daniel B. 
Hill"); ++MODULE_LICENSE("GPL"); +diff --git a/lib/math/mean_and_variance_test.c b/lib/math/mean_and_variance_test.c +new file mode 100644 +index 000000000000..4180e6baac96 +--- /dev/null ++++ b/lib/math/mean_and_variance_test.c +@@ -0,0 +1,152 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include ++#include ++ ++#define MAX_SQR (SQRT_U64_MAX*SQRT_U64_MAX) ++ ++static void mean_and_variance_basic_test(struct kunit *test) ++{ ++ struct mean_and_variance s = {}; ++ ++ s = mean_and_variance_update(s, 2); ++ s = mean_and_variance_update(s, 2); ++ ++ KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(s), 2); ++ KUNIT_EXPECT_EQ(test, mean_and_variance_get_variance(s), 0); ++ KUNIT_EXPECT_EQ(test, s.n, 2); ++ ++ s = mean_and_variance_update(s, 4); ++ s = mean_and_variance_update(s, 4); ++ ++ KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(s), 3); ++ KUNIT_EXPECT_EQ(test, mean_and_variance_get_variance(s), 1); ++ KUNIT_EXPECT_EQ(test, s.n, 4); ++} ++ ++/* ++ * Test values computed using a spreadsheet from the psuedocode at the bottom: ++ * https://fanf2.user.srcf.net/hermes/doc/antiforgery/stats.pdf ++ */ ++ ++static void mean_and_variance_weighted_test(struct kunit *test) ++{ ++ struct mean_and_variance_weighted s = {}; ++ ++ s.w = 2; ++ ++ s = mean_and_variance_weighted_update(s, 10); ++ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 10); ++ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 0); ++ ++ s = mean_and_variance_weighted_update(s, 20); ++ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 12); ++ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 18); ++ ++ s = mean_and_variance_weighted_update(s, 30); ++ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 16); ++ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 72); ++ ++ s = (struct mean_and_variance_weighted){}; ++ s.w = 2; ++ ++ s = mean_and_variance_weighted_update(s, -10); ++ KUNIT_EXPECT_EQ(test, 
mean_and_variance_weighted_get_mean(s), -10); ++ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 0); ++ ++ s = mean_and_variance_weighted_update(s, -20); ++ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -12); ++ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 18); ++ ++ s = mean_and_variance_weighted_update(s, -30); ++ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -16); ++ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 72); ++ ++} ++ ++static void mean_and_variance_weighted_advanced_test(struct kunit *test) ++{ ++ struct mean_and_variance_weighted s = {}; ++ s64 i; ++ ++ s.w = 8; ++ for (i = 10; i <= 100; i += 10) ++ s = mean_and_variance_weighted_update(s, i); ++ ++ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 11); ++ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 107); ++ ++ s = (struct mean_and_variance_weighted){}; ++ ++ s.w = 8; ++ for (i = -10; i >= -100; i -= 10) ++ s = mean_and_variance_weighted_update(s, i); ++ ++ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -11); ++ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 107); ++ ++} ++ ++static void mean_and_variance_fast_divpow2(struct kunit *test) ++{ ++ s64 i; ++ u8 d; ++ ++ for (i = 0; i < 100; i++) { ++ d = 0; ++ KUNIT_EXPECT_EQ(test, fast_divpow2(i, d), div_u64(i, 1LLU << d)); ++ KUNIT_EXPECT_EQ(test, abs(fast_divpow2(-i, d)), div_u64(i, 1LLU << d)); ++ for (d = 1; d < 32; d++) { ++ KUNIT_EXPECT_EQ_MSG(test, abs(fast_divpow2(i, d)), ++ div_u64(i, 1 << d), "%lld %u", i, d); ++ KUNIT_EXPECT_EQ_MSG(test, abs(fast_divpow2(-i, d)), ++ div_u64(i, 1 << d), "%lld %u", -i, d); ++ } ++ } ++} ++ ++static void mean_and_variance_u128_basic_test(struct kunit *test) ++{ ++ u128 a = u128_shl64_add(0, U64_MAX); ++ u128 a1 = u128_shl64_add(0, 1); ++ u128 b = u128_shl64_add(1, 0); ++ u128 c = u128_shl64_add(0, 1LLU << 63); ++ u128 c2 = u128_shl64_add(U64_MAX, U64_MAX); 
++ ++ KUNIT_EXPECT_EQ(test, u128_shr64_to_u64(u128_add(a, a1)), 1); ++ KUNIT_EXPECT_EQ(test, u128_to_u64(u128_add(a, a1)), 0); ++ KUNIT_EXPECT_EQ(test, u128_shr64_to_u64(u128_add(a1, a)), 1); ++ KUNIT_EXPECT_EQ(test, u128_to_u64(u128_add(a1, a)), 0); ++ ++ KUNIT_EXPECT_EQ(test, u128_to_u64(u128_sub(b, a1)), U64_MAX); ++ KUNIT_EXPECT_EQ(test, u128_shr64_to_u64(u128_sub(b, a1)), 0); ++ ++ KUNIT_EXPECT_EQ(test, u128_shr64_to_u64(u128_shl(c, 1)), 1); ++ KUNIT_EXPECT_EQ(test, u128_to_u64(u128_shl(c, 1)), 0); ++ ++ KUNIT_EXPECT_EQ(test, u128_shr64_to_u64(u128_square(U64_MAX)), U64_MAX - 1); ++ KUNIT_EXPECT_EQ(test, u128_to_u64(u128_square(U64_MAX)), 1); ++ ++ KUNIT_EXPECT_EQ(test, u128_to_u64(u128_div(b, 2)), 1LLU << 63); ++ ++ KUNIT_EXPECT_EQ(test, u128_shr64_to_u64(u128_div(c2, 2)), U64_MAX >> 1); ++ KUNIT_EXPECT_EQ(test, u128_to_u64(u128_div(c2, 2)), U64_MAX); ++ ++ KUNIT_EXPECT_EQ(test, u128_shr64_to_u64(u128_div(u128_shl(u64_to_u128(U64_MAX), 32), 2)), U32_MAX >> 1); ++ KUNIT_EXPECT_EQ(test, u128_to_u64(u128_div(u128_shl(u64_to_u128(U64_MAX), 32), 2)), U64_MAX << 31); ++} ++ ++static struct kunit_case mean_and_variance_test_cases[] = { ++ KUNIT_CASE(mean_and_variance_fast_divpow2), ++ KUNIT_CASE(mean_and_variance_u128_basic_test), ++ KUNIT_CASE(mean_and_variance_basic_test), ++ KUNIT_CASE(mean_and_variance_weighted_test), ++ KUNIT_CASE(mean_and_variance_weighted_advanced_test), ++ {} ++}; ++ ++static struct kunit_suite mean_and_variance_test_suite = { ++.name = "mean and variance tests", ++.test_cases = mean_and_variance_test_cases ++}; ++ ++kunit_test_suite(mean_and_variance_test_suite); diff --git a/lib/pretty-printers.c b/lib/pretty-printers.c new file mode 100644 index 000000000000..addbac95e065 @@ -89057,7 +91288,7 @@ index 4bd15a593fbd..7130ed9f56d7 100644 kfree(alloced_buffer); } diff --git a/lib/vsprintf.c b/lib/vsprintf.c -index 3c1853a9d1c0..5e78781bbca8 100644 +index 3c1853a9d1c0..51314ee7801c 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -44,6 
+44,7 @@ @@ -89076,7 +91307,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 #include "kstrtox.h" /* Disable pointer hashing if requested */ -@@ -367,41 +369,51 @@ char *put_dec(char *buf, unsigned long long n) +@@ -367,41 +369,52 @@ char *put_dec(char *buf, unsigned long long n) #endif @@ -89136,6 +91367,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 +{ + prt_u64_minwidth(out, num, 0); +} ++EXPORT_SYMBOL_GPL(prt_u64); + +/* + * Convert passed number to decimal string. @@ -89154,7 +91386,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 } #define SIGN 1 /* unsigned/signed, must be 1 */ -@@ -435,7 +447,8 @@ enum format_type { +@@ -435,7 +448,8 @@ enum format_type { FORMAT_TYPE_UINT, FORMAT_TYPE_INT, FORMAT_TYPE_SIZE_T, @@ -89164,7 +91396,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 }; struct printf_spec { -@@ -451,128 +464,103 @@ static_assert(sizeof(struct printf_spec) == 8); +@@ -451,128 +465,103 @@ static_assert(sizeof(struct printf_spec) == 8); #define PRECISION_MAX ((1 << 15) - 1) static noinline_for_stack @@ -89343,7 +91575,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 { struct printf_spec spec; -@@ -582,25 +570,28 @@ char *special_hex_number(char *buf, char *end, unsigned long long num, int size) +@@ -582,25 +571,28 @@ char *special_hex_number(char *buf, char *end, unsigned long long num, int size) spec.base = 16; spec.precision = -1; @@ -89388,7 +91620,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 } /* -@@ -612,67 +603,68 @@ static void move_right(char *buf, char *end, unsigned len, unsigned spaces) +@@ -612,67 +604,68 @@ static void move_right(char *buf, char *end, unsigned len, unsigned spaces) * Returns: new buffer position after padding. */ static noinline_for_stack @@ -89497,7 +91729,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 { /* * Hard limit to avoid a completely insane messages. 
It actually -@@ -682,7 +674,7 @@ static char *error_string(char *buf, char *end, const char *s, +@@ -682,7 +675,7 @@ static char *error_string(char *buf, char *end, const char *s, if (spec.precision == -1) spec.precision = 2 * sizeof(void *); @@ -89506,7 +91738,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 } /* -@@ -701,14 +693,15 @@ static const char *check_pointer_msg(const void *ptr) +@@ -701,14 +694,15 @@ static const char *check_pointer_msg(const void *ptr) return NULL; } @@ -89524,7 +91756,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 return -EFAULT; } -@@ -716,18 +709,50 @@ static int check_pointer(char **buf, char *end, const void *ptr, +@@ -716,18 +710,50 @@ static int check_pointer(char **buf, char *end, const void *ptr, } static noinline_for_stack @@ -89583,7 +91815,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 { spec.base = 16; spec.flags |= SMALL; -@@ -736,7 +761,7 @@ static char *pointer_string(char *buf, char *end, +@@ -736,7 +762,7 @@ static char *pointer_string(char *buf, char *end, spec.flags |= ZEROPAD; } @@ -89592,7 +91824,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 } /* Make pointers available for printing early in the boot sequence. */ -@@ -801,8 +826,9 @@ int ptr_to_hashval(const void *ptr, unsigned long *hashval_out) +@@ -801,8 +827,9 @@ int ptr_to_hashval(const void *ptr, unsigned long *hashval_out) return __ptr_to_hashval(ptr, hashval_out); } @@ -89604,7 +91836,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 { const char *str = sizeof(ptr) == 8 ? "(____ptrval____)" : "(ptrval)"; unsigned long hashval; -@@ -813,47 +839,49 @@ static char *ptr_to_id(char *buf, char *end, const void *ptr, +@@ -813,47 +840,49 @@ static char *ptr_to_id(char *buf, char *end, const void *ptr, * as they are not actual addresses. 
*/ if (IS_ERR_OR_NULL(ptr)) @@ -89665,7 +91897,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 case 1: { const struct cred *cred; -@@ -864,7 +892,7 @@ char *restricted_pointer(char *buf, char *end, const void *ptr, +@@ -864,7 +893,7 @@ char *restricted_pointer(char *buf, char *end, const void *ptr, if (in_irq() || in_serving_softirq() || in_nmi()) { if (spec.field_width == -1) spec.field_width = 2 * sizeof(ptr); @@ -89674,7 +91906,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 } /* -@@ -890,17 +918,16 @@ char *restricted_pointer(char *buf, char *end, const void *ptr, +@@ -890,17 +919,16 @@ char *restricted_pointer(char *buf, char *end, const void *ptr, break; } @@ -89697,7 +91929,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 switch (fmt[1]) { case '2': case '3': case '4': -@@ -912,9 +939,9 @@ char *dentry_name(char *buf, char *end, const struct dentry *d, struct printf_sp +@@ -912,9 +940,9 @@ char *dentry_name(char *buf, char *end, const struct dentry *d, struct printf_sp rcu_read_lock(); for (i = 0; i < depth; i++, d = p) { @@ -89709,7 +91941,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 } p = READ_ONCE(d->d_parent); -@@ -926,58 +953,46 @@ char *dentry_name(char *buf, char *end, const struct dentry *d, struct printf_sp +@@ -926,58 +954,46 @@ char *dentry_name(char *buf, char *end, const struct dentry *d, struct printf_sp break; } } @@ -89787,7 +92019,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 { unsigned long value; #ifdef CONFIG_KALLSYMS -@@ -1000,17 +1015,12 @@ char *symbol_string(char *buf, char *end, void *ptr, +@@ -1000,17 +1016,12 @@ char *symbol_string(char *buf, char *end, void *ptr, else sprint_symbol_no_offset(sym, value); @@ -89807,7 +92039,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 static const struct printf_spec default_flag_spec = { .base = 16, .precision = -1, -@@ -1022,23 +1032,9 @@ static const struct printf_spec default_dec_spec = { +@@ -1022,23 +1033,9 @@ static const struct printf_spec default_dec_spec = { .precision = -1, }; @@ -89833,7 +92065,7 
@@ index 3c1853a9d1c0..5e78781bbca8 100644 { #ifndef IO_RSRC_PRINTK_SIZE #define IO_RSRC_PRINTK_SIZE 6 -@@ -1077,80 +1073,79 @@ char *resource_string(char *buf, char *end, struct resource *res, +@@ -1077,80 +1074,79 @@ char *resource_string(char *buf, char *end, struct resource *res, #define FLAG_BUF_SIZE (2 * sizeof(res->flags)) #define DECODED_BUF_SIZE sizeof("[mem - 64bit pref window disabled]") #define RAW_BUF_SIZE sizeof("[mem - flags 0x]") @@ -89950,7 +92182,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 switch (fmt[1]) { case 'C': -@@ -1167,41 +1162,21 @@ char *hex_string(char *buf, char *end, u8 *addr, struct printf_spec spec, +@@ -1167,41 +1163,21 @@ char *hex_string(char *buf, char *end, u8 *addr, struct printf_spec spec, break; } @@ -89998,7 +92230,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 chunksz = nr_bits & (CHUNKSZ - 1); if (chunksz == 0) -@@ -1217,63 +1192,53 @@ char *bitmap_string(char *buf, char *end, unsigned long *bitmap, +@@ -1217,63 +1193,53 @@ char *bitmap_string(char *buf, char *end, unsigned long *bitmap, bit = i % BITS_PER_LONG; val = (bitmap[word] >> bit) & chunkmask; @@ -90080,7 +92312,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 switch (fmt[1]) { case 'F': -@@ -1291,25 +1256,23 @@ char *mac_address_string(char *buf, char *end, u8 *addr, +@@ -1291,25 +1257,23 @@ char *mac_address_string(char *buf, char *end, u8 *addr, for (i = 0; i < 6; i++) { if (reversed) @@ -90115,7 +92347,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 switch (fmt[2]) { case 'h': -@@ -1333,28 +1296,15 @@ char *ip4_string(char *p, const u8 *addr, const char *fmt) +@@ -1333,28 +1297,15 @@ char *ip4_string(char *p, const u8 *addr, const char *fmt) break; } for (i = 0; i < 4; i++) { @@ -90148,7 +92380,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 { int i, j, range; unsigned char zerolength[8]; -@@ -1398,14 +1348,14 @@ char *ip6_compressed_string(char *p, const char *addr) +@@ -1398,14 +1349,14 @@ char *ip6_compressed_string(char *p, const char *addr) for (i = 0; i < range; 
i++) { if (i == colonpos) { if (needcolon || i == 0) @@ -90166,7 +92398,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 needcolon = false; } /* hex u16 without leading 0s */ -@@ -1414,81 +1364,56 @@ char *ip6_compressed_string(char *p, const char *addr) +@@ -1414,81 +1365,56 @@ char *ip6_compressed_string(char *p, const char *addr) lo = word & 0xff; if (hi) { if (hi > 0x0f) @@ -90266,7 +92498,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 fmt++; while (isalpha(*++fmt)) { -@@ -1508,44 +1433,36 @@ char *ip6_addr_string_sa(char *buf, char *end, const struct sockaddr_in6 *sa, +@@ -1508,44 +1434,36 @@ char *ip6_addr_string_sa(char *buf, char *end, const struct sockaddr_in6 *sa, } } @@ -90324,7 +92556,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 const u8 *addr = (const u8 *) &sa->sin_addr.s_addr; char fmt4[3] = { fmt[0], '4', 0 }; -@@ -1564,30 +1481,27 @@ char *ip4_addr_string_sa(char *buf, char *end, const struct sockaddr_in *sa, +@@ -1564,30 +1482,27 @@ char *ip4_addr_string_sa(char *buf, char *end, const struct sockaddr_in *sa, } } @@ -90364,7 +92596,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 case 'S': { const union { struct sockaddr raw; -@@ -1597,21 +1511,21 @@ char *ip_addr_string(char *buf, char *end, const void *ptr, +@@ -1597,21 +1512,21 @@ char *ip_addr_string(char *buf, char *end, const void *ptr, switch (sa->raw.sa_family) { case AF_INET: @@ -90392,7 +92624,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 { bool found = true; int count = 1; -@@ -1619,10 +1533,10 @@ char *escaped_string(char *buf, char *end, u8 *addr, struct printf_spec spec, +@@ -1619,10 +1534,10 @@ char *escaped_string(char *buf, char *end, u8 *addr, struct printf_spec spec, int len; if (spec.field_width == 0) @@ -90406,7 +92638,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 do { switch (fmt[count++]) { -@@ -1657,44 +1571,32 @@ char *escaped_string(char *buf, char *end, u8 *addr, struct printf_spec spec, +@@ -1657,44 +1572,32 @@ char *escaped_string(char *buf, char *end, u8 *addr, struct printf_spec 
spec, flags = ESCAPE_ANY_NP; len = spec.field_width < 0 ? 1 : spec.field_width; @@ -90461,7 +92693,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 switch (*(++fmt)) { case 'L': -@@ -1710,60 +1612,54 @@ char *uuid_string(char *buf, char *end, const u8 *addr, +@@ -1710,60 +1613,54 @@ char *uuid_string(char *buf, char *end, const u8 *addr, for (i = 0; i < 16; i++) { if (uc) @@ -90537,7 +92769,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 orig = get_unaligned(fourcc); val = orig & ~BIT(31); -@@ -1772,31 +1668,27 @@ char *fourcc_string(char *buf, char *end, const u32 *fourcc, +@@ -1772,31 +1669,27 @@ char *fourcc_string(char *buf, char *end, const u32 *fourcc, unsigned char c = val >> (i * 8); /* Print non-control ASCII characters as-is, dot otherwise */ @@ -90580,7 +92812,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 switch (fmt[1]) { case 'd': -@@ -1810,55 +1702,44 @@ char *address_val(char *buf, char *end, const void *addr, +@@ -1810,55 +1703,44 @@ char *address_val(char *buf, char *end, const void *addr, break; } @@ -90654,7 +92886,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 switch (fmt[count]) { case 'd': -@@ -1886,21 +1767,16 @@ char *rtc_str(char *buf, char *end, const struct rtc_time *tm, +@@ -1886,21 +1768,16 @@ char *rtc_str(char *buf, char *end, const struct rtc_time *tm, } while (found); if (have_d) @@ -90682,7 +92914,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 { struct rtc_time rtc_time; struct tm tm; -@@ -1918,47 +1794,47 @@ char *time64_str(char *buf, char *end, const time64_t time, +@@ -1918,47 +1795,47 @@ char *time64_str(char *buf, char *end, const time64_t time, rtc_time.tm_isdst = 0; @@ -90745,7 +92977,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 { unsigned long mask; -@@ -1967,20 +1843,15 @@ char *format_flags(char *buf, char *end, unsigned long flags, +@@ -1967,20 +1844,15 @@ char *format_flags(char *buf, char *end, unsigned long flags, if ((flags & mask) != mask) continue; @@ -90770,7 +93002,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 } struct 
page_flags_fields { -@@ -2005,20 +1876,18 @@ static const struct page_flags_fields pff[] = { +@@ -2005,20 +1877,18 @@ static const struct page_flags_fields pff[] = { }; static @@ -90795,7 +93027,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 append = true; } -@@ -2029,41 +1898,31 @@ char *format_page_flags(char *buf, char *end, unsigned long flags) +@@ -2029,41 +1899,31 @@ char *format_page_flags(char *buf, char *end, unsigned long flags) continue; /* Format: Flag Name + '=' (equals sign) + Number + '|' (separator) */ @@ -90848,7 +93080,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 case 'v': flags = *(unsigned long *)flags_ptr; names = vmaflag_names; -@@ -2073,15 +1932,15 @@ char *flags_string(char *buf, char *end, void *flags_ptr, +@@ -2073,15 +1933,15 @@ char *flags_string(char *buf, char *end, void *flags_ptr, names = gfpflag_names; break; default: @@ -90868,7 +93100,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 { int depth; -@@ -2090,39 +1949,30 @@ char *fwnode_full_name_string(struct fwnode_handle *fwnode, char *buf, +@@ -2090,39 +1950,30 @@ char *fwnode_full_name_string(struct fwnode_handle *fwnode, char *buf, struct fwnode_handle *__fwnode = fwnode_get_nth_parent(fwnode, depth); @@ -90916,7 +93148,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 /* simple case without anything any more format specifiers */ fmt++; -@@ -2130,55 +1980,48 @@ char *device_node_string(char *buf, char *end, struct device_node *dn, +@@ -2130,55 +1981,48 @@ char *device_node_string(char *buf, char *end, struct device_node *dn, fmt = "f"; for (pass = false; strspn(fmt,"fnpPFcC"); fmt++, pass = true) { @@ -90995,7 +93227,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 has_mult = true; } -@@ -2187,38 +2030,30 @@ char *device_node_string(char *buf, char *end, struct device_node *dn, +@@ -2187,38 +2031,30 @@ char *device_node_string(char *buf, char *end, struct device_node *dn, break; } } @@ -91042,7 +93274,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 } int __init no_hash_pointers_enable(char *str) 
-@@ -2374,33 +2209,40 @@ early_param("no_hash_pointers", no_hash_pointers_enable); +@@ -2374,33 +2210,40 @@ early_param("no_hash_pointers", no_hash_pointers_enable); * rendering it useful as a unique identifier. */ static noinline_for_stack @@ -91091,7 +93323,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 case 'I': /* Formatted IP supported * 4: 1.2.3.4 * 6: 0001:0203:...:0708 -@@ -2410,57 +2252,69 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr, +@@ -2410,57 +2253,69 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr, * 4: 001.002.003.004 * 6: 000102...0f */ @@ -91183,7 +93415,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 } } -@@ -2599,8 +2453,14 @@ int format_decode(const char *fmt, struct printf_spec *spec) +@@ -2599,8 +2454,14 @@ int format_decode(const char *fmt, struct printf_spec *spec) return ++fmt - start; case 'p': @@ -91200,7 +93432,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 case '%': spec->type = FORMAT_TYPE_PERCENT_CHAR; -@@ -2681,53 +2541,89 @@ set_precision(struct printf_spec *spec, int prec) +@@ -2681,53 +2542,89 @@ set_precision(struct printf_spec *spec, int prec) } } @@ -91324,7 +93556,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 while (*fmt) { const char *old_fmt = fmt; -@@ -2736,16 +2632,9 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) +@@ -2736,16 +2633,9 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) fmt += read; switch (spec.type) { @@ -91343,7 +93575,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 case FORMAT_TYPE_WIDTH: set_field_width(&spec, va_arg(args, int)); -@@ -2755,44 +2644,60 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) +@@ -2755,44 +2645,60 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) set_precision(&spec, va_arg(args, int)); break; @@ -91429,7 +93661,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 break; case FORMAT_TYPE_INVALID: -@@ -2845,21 +2750,70 @@ int vsnprintf(char *buf, size_t size, const char 
*fmt, va_list args) +@@ -2845,21 +2751,70 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) num = va_arg(args, unsigned int); } @@ -91510,7 +93742,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 } EXPORT_SYMBOL(vsnprintf); -@@ -2997,53 +2951,46 @@ EXPORT_SYMBOL(sprintf); +@@ -2997,53 +2952,46 @@ EXPORT_SYMBOL(sprintf); * bstr_printf() - Binary data to text string */ @@ -91581,7 +93813,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 value; \ }) -@@ -3074,16 +3021,12 @@ int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args) +@@ -3074,16 +3022,12 @@ int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args) case FORMAT_TYPE_STR: { const char *save_str = va_arg(args, char *); const char *err_msg; @@ -91599,7 +93831,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 break; } -@@ -3103,12 +3046,7 @@ int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args) +@@ -3103,12 +3047,7 @@ int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args) save_arg(void *); break; } @@ -91613,7 +93845,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 } /* skip all alphanumeric pointer suffixes */ while (isalnum(*fmt)) -@@ -3146,15 +3084,15 @@ int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args) +@@ -3146,15 +3085,15 @@ int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args) } out: @@ -91633,7 +93865,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 * @fmt: The format string to use * @bin_buf: Binary arguments for the format string * -@@ -3164,26 +3102,14 @@ EXPORT_SYMBOL_GPL(vbin_printf); +@@ -3164,26 +3103,14 @@ EXPORT_SYMBOL_GPL(vbin_printf); * * The format follows C99 vsnprintf, but has some extensions: * see vsnprintf comment for details. 
@@ -91663,7 +93895,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 #define get_arg(type) \ ({ \ -@@ -3200,12 +3126,6 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) +@@ -3200,12 +3127,6 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) value; \ }) @@ -91676,7 +93908,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 while (*fmt) { const char *old_fmt = fmt; int read = format_decode(fmt, &spec); -@@ -3213,16 +3133,9 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) +@@ -3213,16 +3134,9 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) fmt += read; switch (spec.type) { @@ -91695,7 +93927,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 case FORMAT_TYPE_WIDTH: set_field_width(&spec, get_arg(int)); -@@ -3232,38 +3145,24 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) +@@ -3232,38 +3146,24 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) set_precision(&spec, get_arg(int)); break; @@ -91742,7 +93974,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 /* Non function dereferences were already done */ switch (*fmt) { case 'S': -@@ -3279,17 +3178,12 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) +@@ -3279,17 +3179,12 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) break; } /* Pointer dereference was already processed */ @@ -91764,7 +93996,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 while (isalnum(*fmt)) fmt++; -@@ -3297,9 +3191,7 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) +@@ -3297,9 +3192,7 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) } case FORMAT_TYPE_PERCENT_CHAR: @@ -91775,7 +94007,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 break; case FORMAT_TYPE_INVALID: -@@ -3342,23 +3234,87 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) +@@ -3342,23 
+3235,87 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) num = get_arg(int); } @@ -91886,6 +94118,18 @@ index 9a564f836403..5a649b4aebe9 100644 # Give 'page_alloc' its own module-parameter namespace page-alloc-y := page_alloc.o +diff --git a/mm/filemap.c b/mm/filemap.c +index 15800334147b..3ad6eba0ab82 100644 +--- a/mm/filemap.c ++++ b/mm/filemap.c +@@ -2910,6 +2910,7 @@ loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start, + return end; + return start; + } ++EXPORT_SYMBOL(mapping_seek_hole_data); + + #ifdef CONFIG_MMU + #define MMAP_LOTSAMISS (100) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b69979c9ced5..54897e4ac4ef 100644 --- a/mm/memcontrol.c @@ -92467,5 +94711,5 @@ index 01ceb98c15a0..6219f5bbf20e 100644 static DEVICE_ATTR_RO(flags); -- -2.38.0 +2.38.1.385.g3b08839926